diff --git a/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/golden.py b/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/golden.py new file mode 100644 index 000000000..231c3b405 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/golden.py @@ -0,0 +1,109 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Golden script for the host_register_mapped demo.""" + +import atexit +import ctypes +import logging + +import numpy as np +import torch + +from simpler.task_interface import free_host_device_share_mem, malloc_host_device_share_mem + +logger = logging.getLogger(__name__) + +__outputs__ = ["mapped_out"] + +RTOL = 1e-5 +ATOL = 1e-5 +LOG_PREVIEW_COUNT = 16 + +ROWS = 128 +COLS = 128 +SIZE = ROWS * COLS + +_MAPPED_STATE = {} + + +def _log_preview(label: str, values) -> None: + flat = np.asarray(values).reshape(-1) + preview = flat[:LOG_PREVIEW_COUNT].tolist() + logger.info("%s first_%d=%s total=%d", label, min(LOG_PREVIEW_COUNT, flat.size), preview, flat.size) + + +def _cleanup_mapped_state() -> None: + host_ptr = _MAPPED_STATE.get("host_ptr", 0) + if not host_ptr: + _MAPPED_STATE.clear() + return + + try: + free_host_device_share_mem(host_ptr) + except Exception as exc: # noqa: BLE001 + 
logger.warning("free_host_device_share_mem cleanup failed: %s", exc) + + _MAPPED_STATE.clear() + + +atexit.register(_cleanup_mapped_state) + + +def generate_inputs(params: dict) -> list: + del params + _cleanup_mapped_state() + + alloc_size = SIZE * ctypes.sizeof(ctypes.c_float) + host_ptr, mapped_dev_ptr = malloc_host_device_share_mem(alloc_size) + host_buf = (ctypes.c_float * SIZE).from_address(host_ptr) + host_np = np.ctypeslib.as_array(host_buf) + host_np[:] = np.arange(SIZE, dtype=np.float32) + host_tensor = torch.from_numpy(host_np) + _log_preview("host_register_mapped_demo: host_init_data", host_np) + + mapped_out = torch.zeros_like(host_tensor) + + _MAPPED_STATE.update( + { + "host_ptr": host_ptr, + "mapped_dev_ptr": mapped_dev_ptr, + "host_buf": host_buf, + "host_np": host_np, + "host_tensor": host_tensor, + } + ) + + logger.info( + "host_register_mapped_demo: host_ptr=0x%x mapped_dev_ptr=0x%x size=%d", + host_ptr, + mapped_dev_ptr, + host_tensor.numel() * host_tensor.element_size(), + ) + + return [ + ("mapped_out", mapped_out), + ("mapped_dev_ptr", ctypes.c_uint64(mapped_dev_ptr)), + ] + + +def compute_golden(tensors: dict, params: dict) -> None: + del params + host_tensor = _MAPPED_STATE["host_tensor"] + tensors["mapped_out"][:] = host_tensor + 1.0 + + +def post_run_collect(outputs: dict, params: dict) -> None: + del params + host_np = _MAPPED_STATE.get("host_np") + if host_np is not None: + _log_preview("host_register_mapped_demo: host_data_after_run", host_np) + mapped_out = outputs.get("mapped_out") + if mapped_out is not None: + _log_preview("host_register_mapped_demo: device_copy_back_data", mapped_out.detach().cpu().numpy()) + _cleanup_mapped_state() diff --git a/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/aiv/kernel_load_add_one.cpp b/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/aiv/kernel_load_add_one.cpp new file mode 100644 index 000000000..7287da767 --- /dev/null +++ 
b/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/aiv/kernel_load_add_one.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Demo kernel: + * mapped_host_buffer[i] = mapped_host_buffer[i] + 1.0f + * out[i] = mapped_host_buffer[i] (the updated value, i.e. the original value + 1.0f) + * + * The input pointer comes from host_register_mapped(), so a successful result + * shows that the kernel was able to read and write the mapped host buffer + * directly while also producing a regular copy-back output. 
+ */ + +#include +#include + +#include "tensor.h" // NOLINT(build/include_subdir) + +using namespace pto; // NOLINT(build/namespaces) + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] // NOLINT(whitespace/braces) +#endif + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *mapped_host_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ float *mapped_host = reinterpret_cast<__gm__ float *>(mapped_host_tensor->buffer.addr) + mapped_host_tensor->start_offset; + __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset; + + constexpr float kAddValue = 1.0f; + constexpr int kTRows_ = 128; + constexpr int kTCols_ = 128; + constexpr int vRows = 128; + constexpr int vCols = 128; + + using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>; + using DynStridDim5 = Stride<1, 1, 1, kTCols_, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData src_tile(vRows, vCols); + TileData dst_tile(vRows, vCols); + TASSIGN(src_tile, 0x0); + TASSIGN(dst_tile, 0x10000); + + GlobalData mapped_host_global(mapped_host); + GlobalData dst_global(out); + + TLOAD(src_tile, mapped_host_global); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADDS(dst_tile, src_tile, kAddValue); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(mapped_host_global, dst_tile); + TSTORE(dst_global, dst_tile); + + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); +} diff --git a/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/kernel_config.py new file mode 100644 index 000000000..43551ae40 --- /dev/null +++ 
b/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/kernel_config.py @@ -0,0 +1,38 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Kernel config for the host_register_mapped demo.""" + +from pathlib import Path + +from simpler.task_interface import ArgDirection as D # pyright: ignore[reportAttributeAccessIssue] + +_KERNELS_ROOT = Path(__file__).parent + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "host_register_mapped_orch.cpp"), + "function_name": "aicpu_orchestration_entry", + # ChipStorageTaskArgs stores tensors first and scalars after them. 
+ "signature": [D.OUT, D.SCALAR], +} + +KERNELS = [ + { + "func_id": 0, + "source": str(_KERNELS_ROOT / "aiv" / "kernel_load_add_one.cpp"), + "core_type": "aiv", + "signature": [D.INOUT, D.OUT], + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "block_dim": 3, + "rounds": 1, +} diff --git a/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/orchestration/host_register_mapped_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/orchestration/host_register_mapped_orch.cpp new file mode 100644 index 000000000..f9c9a87ff --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/orchestration/host_register_mapped_orch.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Demo orchestration for host-side mapped memory. + * + * Args layout in ChipStorageTaskArgs: + * tensor(0): mapped_out (host tensor copied back by runtime) + * scalar(0): mapped_dev_ptr (device-visible address returned by host_register_mapped) + * + * The mapped host buffer is wrapped as an external tensor and submitted as + * INOUT so the kernel updates host-visible memory in place. 
+ */ + +#include + +#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; + return PTO2OrchestrationConfig{ + .expected_arg_count = 2, + }; +} + +__attribute__((visibility("default"))) void aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args) { + const ContinuousTensor &out_arg = orch_args.tensor(0); + Tensor mapped_out = from_tensor_arg(out_arg); + + uint64_t mapped_input_u64 = orch_args.scalar(0); + Tensor mapped_host_buffer = make_tensor_external( + reinterpret_cast(static_cast(mapped_input_u64)), out_arg.shapes, out_arg.ndims, out_arg.dtype + ); + + LOG_INFO( + "host_register_mapped_demo: mapped_host_buffer=0x%lx mapped_out=0x%lx elements=%u", mapped_input_u64, out_arg.data, + out_arg.shapes[0] + ); + + Arg params; + params.add_inout(mapped_host_buffer); + params.add_output(mapped_out); + pto2_rt_submit_aiv_task(0, params); +} + +} // extern "C" diff --git a/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/README.md b/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/README.md new file mode 100644 index 000000000..c05c73fa9 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/README.md @@ -0,0 +1,59 @@ +# A5 Host Register Mapped Demo + +这个 demo 用来验证 `a5` 平台上的两件事: +- `mallocHostDeviceShareMem(...)` 可以在 Host 侧申请并注册一段 Device 可访问地址 +- AIV kernel 可以直接读取并写回这段映射内存 + +## 本次 a5 修改点 + +- 在 `src/a5/platform/onboard/host/pto_runtime_c_api.cpp` 中实现了: + - `mallocHostDeviceShareMem(...)` + - `freeHostDeviceShareMem(...)` +- 这两个接口的执行顺序和 `a2a3` 保持一致: + - `GetDevice / SetDevice` + - `MallocHost / FreeHost` + - `HostRegister / HostUnregister` +- Python 侧继续复用通用封装: + - `malloc_host_device_share_mem(...)` + - `free_host_device_share_mem(...)` +- 新增了 `examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo` 用于硬件验证 + +## Demo 行为 + 
+- Host 通过 `malloc_host_device_share_mem(...)` 拿到: + - `host_ptr` + - `mapped_dev_ptr` +- Host 把 `host_ptr` 指向的缓冲区初始化为 `0, 1, 2, ...` +- orchestration 把 `mapped_dev_ptr` 包成外部 tensor +- kernel 执行: + - `mapped_host_buffer[i] = mapped_host_buffer[i] + 1` + - `mapped_out[i] = mapped_host_buffer[i]`(即更新后的值,等于初始值 + 1) +- 运行结束后打印: + - 初始 Host 数据 + - 执行后 Host 内存数据 + - 普通 output copy-back 数据 + +## 启动命令 + +在仓库根目录执行: + +```bash +python examples/scripts/run_example.py --build \ + -k examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/kernels \ + -g examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/golden.py \ + -p a5 -d 0 +``` + +如果环境已经提前编好了 runtime,也可以去掉 `--build`。 + +## 结果判断 + +成功时建议重点看三组日志: +- `a5_host_register_mapped_demo: host_init_data` +- `a5_host_register_mapped_demo: host_data_after_run` +- `a5_host_register_mapped_demo: device_copy_back_data` + +理想结果是: +- `host_init_data` 为 `0, 1, 2, ...` +- `host_data_after_run` 为 `1, 2, 3, ...` +- `device_copy_back_data` 也为 `1, 2, 3, ...` diff --git a/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/golden.py b/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/golden.py new file mode 100644 index 000000000..21ad1fbec --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/golden.py @@ -0,0 +1,109 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +"""Golden script for the a5 host_register_mapped demo.""" + +import atexit +import ctypes +import logging + +import numpy as np +import torch + +from simpler.task_interface import free_host_device_share_mem, malloc_host_device_share_mem + +logger = logging.getLogger(__name__) + +__outputs__ = ["mapped_out"] + +RTOL = 1e-5 +ATOL = 1e-5 +LOG_PREVIEW_COUNT = 16 + +ROWS = 128 +COLS = 128 +SIZE = ROWS * COLS + +_MAPPED_STATE = {} + + +def _log_preview(label: str, values) -> None: + flat = np.asarray(values).reshape(-1) + preview = flat[:LOG_PREVIEW_COUNT].tolist() + logger.info("%s first_%d=%s total=%d", label, min(LOG_PREVIEW_COUNT, flat.size), preview, flat.size) + + +def _cleanup_mapped_state() -> None: + host_ptr = _MAPPED_STATE.get("host_ptr", 0) + if not host_ptr: + _MAPPED_STATE.clear() + return + + try: + free_host_device_share_mem(host_ptr) + except Exception as exc: # noqa: BLE001 + logger.warning("free_host_device_share_mem cleanup failed: %s", exc) + + _MAPPED_STATE.clear() + + +atexit.register(_cleanup_mapped_state) + + +def generate_inputs(params: dict) -> list: + del params + _cleanup_mapped_state() + + alloc_size = SIZE * ctypes.sizeof(ctypes.c_float) + host_ptr, mapped_dev_ptr = malloc_host_device_share_mem(alloc_size) + host_buf = (ctypes.c_float * SIZE).from_address(host_ptr) + host_np = np.ctypeslib.as_array(host_buf) + host_np[:] = np.arange(SIZE, dtype=np.float32) + host_tensor = torch.from_numpy(host_np) + _log_preview("a5_host_register_mapped_demo: host_init_data", host_np) + + mapped_out = torch.zeros_like(host_tensor) + + _MAPPED_STATE.update( + { + "host_ptr": host_ptr, + "mapped_dev_ptr": mapped_dev_ptr, + "host_buf": host_buf, + "host_np": host_np, + "host_tensor": host_tensor, + } + ) + + logger.info( + "a5_host_register_mapped_demo: host_ptr=0x%x mapped_dev_ptr=0x%x size=%d", + host_ptr, + mapped_dev_ptr, + host_tensor.numel() * 
host_tensor.element_size(), + ) + + return [ + ("mapped_out", mapped_out), + ("mapped_dev_ptr", ctypes.c_uint64(mapped_dev_ptr)), + ] + + +def compute_golden(tensors: dict, params: dict) -> None: + del params + host_tensor = _MAPPED_STATE["host_tensor"] + tensors["mapped_out"][:] = host_tensor + 1.0 + + +def post_run_collect(outputs: dict, params: dict) -> None: + del params + host_np = _MAPPED_STATE.get("host_np") + if host_np is not None: + _log_preview("a5_host_register_mapped_demo: host_data_after_run", host_np) + mapped_out = outputs.get("mapped_out") + if mapped_out is not None: + _log_preview("a5_host_register_mapped_demo: device_copy_back_data", mapped_out.detach().cpu().numpy()) + _cleanup_mapped_state() diff --git a/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/aiv/kernel_load_add_one.cpp b/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/aiv/kernel_load_add_one.cpp new file mode 100644 index 000000000..79e41e6d8 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/aiv/kernel_load_add_one.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Demo kernel: + * mapped_host_buffer[i] = mapped_host_buffer[i] + 1.0f + * out[i] = mapped_host_buffer[i] + 1.0f + * + * The input pointer comes from mallocHostDeviceShareMem(), so a successful result + * shows that the kernel was able to read and write the mapped host buffer + * directly while also producing a regular copy-back output. + */ + +#include +#include + +#include "tensor.h" // NOLINT(build/include_subdir) + +using namespace pto; // NOLINT(build/namespaces) + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] // NOLINT(whitespace/braces) +#endif + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *mapped_host_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ float *mapped_host = reinterpret_cast<__gm__ float *>(mapped_host_tensor->buffer.addr) + mapped_host_tensor->start_offset; + __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset; + + constexpr float kAddValue = 1.0f; + constexpr int kTRows_ = 128; + constexpr int kTCols_ = 128; + constexpr int vRows = 128; + constexpr int vCols = 128; + + using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>; + using DynStridDim5 = Stride<1, 1, 1, kTCols_, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData src_tile(vRows, vCols); + TileData dst_tile(vRows, vCols); + TASSIGN(src_tile, 0x0); + TASSIGN(dst_tile, 0x10000); + + GlobalData mapped_host_global(mapped_host); + GlobalData dst_global(out); + + TLOAD(src_tile, mapped_host_global); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADDS(dst_tile, src_tile, kAddValue); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, 
EVENT_ID0); + TSTORE(mapped_host_global, dst_tile); + TSTORE(dst_global, dst_tile); + + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); +} diff --git a/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/kernel_config.py new file mode 100644 index 000000000..d50929064 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/kernel_config.py @@ -0,0 +1,37 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +"""Kernel config for the a5 host_register_mapped demo.""" + +from pathlib import Path + +from simpler.task_interface import ArgDirection as D # pyright: ignore[reportAttributeAccessIssue] + +_KERNELS_ROOT = Path(__file__).parent + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "host_register_mapped_orch.cpp"), + "function_name": "aicpu_orchestration_entry", + "signature": [D.OUT, D.SCALAR], +} + +KERNELS = [ + { + "func_id": 0, + "source": str(_KERNELS_ROOT / "aiv" / "kernel_load_add_one.cpp"), + "core_type": "aiv", + "signature": [D.INOUT, D.OUT], + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "block_dim": 3, + "rounds": 1, +} diff --git a/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/orchestration/host_register_mapped_orch.cpp b/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/orchestration/host_register_mapped_orch.cpp new file mode 100644 index 000000000..7fe235e45 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/orchestration/host_register_mapped_orch.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Demo orchestration for host-side mapped memory on a5. + * + * Args layout in ChipStorageTaskArgs: + * tensor(0): mapped_out (host tensor copied back by runtime) + * scalar(0): mapped_dev_ptr (device-visible address returned by mallocHostDeviceShareMem) + */ + +#include + +#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { + (void)orch_args; + return PTO2OrchestrationConfig{ + .expected_arg_count = 2, + }; +} + +__attribute__((visibility("default"))) void aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args) { + const ContinuousTensor &out_arg = orch_args.tensor(0); + Tensor mapped_out = from_tensor_arg(out_arg); + + uint64_t mapped_input_u64 = orch_args.scalar(0); + Tensor mapped_host_buffer = make_tensor_external( + reinterpret_cast(static_cast(mapped_input_u64)), out_arg.shapes, out_arg.ndims, out_arg.dtype + ); + + LOG_INFO( + "a5_host_register_mapped_demo: mapped_host_buffer=0x%lx mapped_out=0x%lx elements=%u", + mapped_input_u64, out_arg.data, out_arg.shapes[0] + ); + + Arg params; + params.add_inout(mapped_host_buffer); + params.add_output(mapped_out); + pto2_rt_submit_aiv_task(0, params); +} + +} // extern "C" diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp index d949735c3..5df201593 100644 --- a/python/bindings/task_interface.cpp +++ b/python/bindings/task_interface.cpp @@ -560,6 +560,23 @@ NB_MODULE(_task_interface, m) { .def("set_device", &ChipWorker::set_device, nb::arg("device_id")) .def("reset_device", &ChipWorker::reset_device) .def("finalize", &ChipWorker::finalize) + .def( + "malloc_host_device_share_mem", + [](ChipWorker &self, uint64_t size, int device_id) -> nb::tuple { + uint64_t host_ptr = 0; + uint64_t dev_ptr 
= 0; + self.mallocHostDeviceShareMem(size, &host_ptr, &dev_ptr, device_id); + nb::list lst; + lst.append(host_ptr); + lst.append(dev_ptr); + return nb::tuple(lst); + }, + nb::arg("size"), nb::arg("device_id") = -1 + ) + .def( + "free_host_device_share_mem", &ChipWorker::freeHostDeviceShareMem, nb::arg("host_ptr"), + nb::arg("device_id") = -1 + ) .def( "run", [](ChipWorker &self, const PyChipCallable &callable, ChipStorageTaskArgs &args, diff --git a/python/simpler/task_interface.py b/python/simpler/task_interface.py index d84fa8352..c23e5a1b6 100644 --- a/python/simpler/task_interface.py +++ b/python/simpler/task_interface.py @@ -57,6 +57,9 @@ "torch_dtype_to_datatype", "make_tensor_arg", "scalar_to_uint64", + "get_active_worker", + "malloc_host_device_share_mem", + "free_host_device_share_mem", # Distributed runtime "WorkerType", "TaskState", @@ -69,6 +72,7 @@ # Lazy-loaded torch dtype → DataType map (avoids importing torch at module load) _TORCH_DTYPE_MAP = None +_ACTIVE_WORKER = None def _ensure_torch_map(): @@ -156,7 +160,9 @@ class ChipWorker: """ def __init__(self): + global _ACTIVE_WORKER self._impl = _ChipWorker() + _ACTIVE_WORKER = self def init(self, host_path, aicpu_path, aicore_path, sim_context_lib_path=""): """Load host runtime library and cache platform binaries. @@ -185,12 +191,28 @@ def reset_device(self): """Release device resources. 
The runtime binding remains intact.""" self._impl.reset_device() + def malloc_host_device_share_mem(self, size, device_id=None): + """Allocate host memory and register it as a device-visible mapped buffer.""" + if device_id is None: + device_id = self.device_id + host_ptr, dev_ptr = self._impl.malloc_host_device_share_mem(int(size), int(device_id)) + return int(host_ptr), int(dev_ptr) + + def free_host_device_share_mem(self, host_ptr, device_id=None): + """Unregister and free a mapped host buffer.""" + if device_id is None: + device_id = self.device_id + self._impl.free_host_device_share_mem(int(host_ptr), int(device_id)) + def finalize(self): """Tear down everything: device resources and runtime library. Terminal operation — the object cannot be reused after this. """ + global _ACTIVE_WORKER self._impl.finalize() + if _ACTIVE_WORKER is self: + _ACTIVE_WORKER = None def run(self, callable, args, config=None, **kwargs): """Execute a callable synchronously. @@ -218,3 +240,24 @@ def initialized(self): @property def device_set(self): return self._impl.device_set + + +def get_active_worker(): + """Return the most recently created ChipWorker in this process.""" + if _ACTIVE_WORKER is None: + raise RuntimeError("No active ChipWorker is available") + if not _ACTIVE_WORKER.initialized: + raise RuntimeError("The active ChipWorker is not initialized") + if not _ACTIVE_WORKER.device_set: + raise RuntimeError("The active ChipWorker does not have a device set") + return _ACTIVE_WORKER + + +def malloc_host_device_share_mem(size, device_id=None): + """Allocate host memory and register it as a device-visible mapped buffer.""" + return get_active_worker().malloc_host_device_share_mem(size, device_id=device_id) + + +def free_host_device_share_mem(host_ptr, device_id=None): + """Unregister and free a mapped host buffer.""" + get_active_worker().free_host_device_share_mem(host_ptr, device_id=device_id) diff --git a/simpler_setup/code_runner.py b/simpler_setup/code_runner.py index 
9f51f2d10..0790fbdb1 100644 --- a/simpler_setup/code_runner.py +++ b/simpler_setup/code_runner.py @@ -542,81 +542,103 @@ def _compile_one_kernel(kernel): ) worker.set_device(self.device_id) - # Step 3: Run each parameter set - total_cases = len(self.params_list) - for case_idx, params in enumerate(self.params_list): - logger.info("=" * 60) - logger.info(f"=== Case {case_idx + 1}/{total_cases}: {params} ===") - logger.info("=" * 60) - - # Generate tensors using golden.py - logger.info("=== Generating Inputs ===") - result = self._golden_module.generate_inputs(params) - - if isinstance(result, list): - # New-style: generate_inputs returns flat argument list - orch_args, args, inputs, outputs = self._build_func_args_from_list(result) - tensors = args # args contains all named items; compute_golden receives all - else: - # Legacy: generate_inputs returns dict of tensors - tensors = {k: _to_torch(v) for k, v in result.items()} - orch_args = self._build_func_args(tensors) - inputs, outputs = self._identify_outputs(tensors) - - logger.info(f"Inputs: {list(inputs.keys())}") - logger.info(f"Outputs: {list(outputs.keys())}") - - # Determine actual tensor order for debugging - logger.debug(f"Tensor order: {list(tensors.keys())}") - logger.debug(f"orch_args count: {len(orch_args)}") - - # Build environment for runtime initialization - run_env = _kernel_config_runtime_env(self._kernel_config, self.kernels_dir) - if run_env: - logger.debug(f"Runtime init env overrides: {run_env}") - - # Golden - if not self.skip_golden: - golden = {k: v.clone() for k, v in outputs.items()} - golden_with_inputs = {**inputs, **golden} - _t_golden_start = time.perf_counter() - self._golden_module.compute_golden(golden_with_inputs, params) - _t_golden_end = time.perf_counter() - logger.info(f">>> compute_golden() took {_t_golden_end - _t_golden_start:.3f}s") - - initial_outputs = {k: v.clone() for k, v in outputs.items()} - - for round_idx in range(self.repeat_rounds): - if self.repeat_rounds > 1: - 
logger.info(f"--- Round {round_idx + 1}/{self.repeat_rounds} ---") - - for k, v in initial_outputs.items(): - outputs[k].copy_(v) - - config = ChipCallConfig() - config.block_dim = self.block_dim - config.aicpu_thread_num = self.aicpu_thread_num - if self.enable_profiling and round_idx == 0: - config.enable_profiling = True - logger.info("Profiling enabled") - if self.enable_dump_tensor: - config.enable_dump_tensor = True - logger.info("Dump tensor enabled") - - with _temporary_env(run_env): - worker.run(chip_callable, orch_args, config) - - if not self.skip_golden: - self._compare_with_golden(outputs, golden) - - logger.info(f"=== Case {case_idx + 1}/{total_cases} Passed ===") - - worker.reset_device() - worker.finalize() + try: + # Step 3: Run each parameter set + total_cases = len(self.params_list) + for case_idx, params in enumerate(self.params_list): + logger.info("=" * 60) + logger.info(f"=== Case {case_idx + 1}/{total_cases}: {params} ===") + logger.info("=" * 60) + + outputs = {} + try: + # Generate tensors using golden.py + logger.info("=== Generating Inputs ===") + result = self._golden_module.generate_inputs(params) + + if isinstance(result, list): + # New-style: generate_inputs returns flat argument list + orch_args, args, inputs, outputs = self._build_func_args_from_list(result) + tensors = args # args contains all named items; compute_golden receives all + else: + # Legacy: generate_inputs returns dict of tensors + tensors = {k: _to_torch(v) for k, v in result.items()} + orch_args = self._build_func_args(tensors) + inputs, outputs = self._identify_outputs(tensors) + + logger.info(f"Inputs: {list(inputs.keys())}") + logger.info(f"Outputs: {list(outputs.keys())}") + + # Determine actual tensor order for debugging + logger.debug(f"Tensor order: {list(tensors.keys())}") + logger.debug(f"orch_args count: {len(orch_args)}") + + # Build environment for runtime initialization + run_env = _kernel_config_runtime_env(self._kernel_config, self.kernels_dir) + if 
run_env: + logger.debug(f"Runtime init env overrides: {run_env}") + + # Golden + if not self.skip_golden: + golden = {k: v.clone() for k, v in outputs.items()} + golden_with_inputs = {**inputs, **golden} + _t_golden_start = time.perf_counter() + self._golden_module.compute_golden(golden_with_inputs, params) + _t_golden_end = time.perf_counter() + logger.info(f">>> compute_golden() took {_t_golden_end - _t_golden_start:.3f}s") + + initial_outputs = {k: v.clone() for k, v in outputs.items()} + + for round_idx in range(self.repeat_rounds): + if self.repeat_rounds > 1: + logger.info(f"--- Round {round_idx + 1}/{self.repeat_rounds} ---") + + for k, v in initial_outputs.items(): + outputs[k].copy_(v) + + config = ChipCallConfig() + config.block_dim = self.block_dim + config.aicpu_thread_num = self.aicpu_thread_num + if self.enable_profiling and round_idx == 0: + config.enable_profiling = True + logger.info("Profiling enabled") + if self.enable_dump_tensor: + config.enable_dump_tensor = True + logger.info("Dump tensor enabled") + + with _temporary_env(run_env): + worker.run(chip_callable, orch_args, config) + + if not self.skip_golden: + self._compare_with_golden(outputs, golden) + + logger.info(f"=== Case {case_idx + 1}/{total_cases} Passed ===") + finally: + self._run_post_run_collect(outputs, params) + finally: + worker.reset_device() + worker.finalize() + logger.info("=" * 60) logger.info(f"=== All {total_cases} cases passed ===") logger.info("=" * 60) + def _run_post_run_collect( + self, + outputs: dict[str, torch.Tensor], + params: dict[str, Any], + ) -> None: + """ + Optional post-run hook. + + If golden.py defines post_run_collect(outputs, params), call it after + the case completes so custom cleanup hooks can release external state. 
+ """ + collect_fn = getattr(self._golden_module, "post_run_collect", None) + if not callable(collect_fn): + return + collect_fn(outputs, params) + def _compare_with_golden( self, outputs: dict[str, torch.Tensor], diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index 2808152eb..c12a663d8 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -20,6 +20,9 @@ #include "callable.h" #include "task_args.h" +#include +#include +#include #include #include @@ -28,6 +31,88 @@ #include "host/raii_scope_guard.h" #include "runtime.h" +namespace { + +using RtMallocHostFn = int (*)(void **, uint64_t, uint32_t); +using RtFreeHostFn = int (*)(void *); +using RtHostRegisterFn = int (*)(void *, uint64_t, uint32_t, void **); +using RtHostUnregisterFn = int (*)(void *); +using AclMallocHostFn = int (*)(void **, size_t); +using AclFreeHostFn = int (*)(void *); +using AclHostRegisterFn = int (*)(void *, uint64_t, uint32_t, void **); +using AclHostUnregisterFn = int (*)(void *); +using GetDeviceFn = int (*)(int32_t *); +using DirectSetDeviceFn = int (*)(int32_t); + +template +Fn resolve_symbol(const char **resolved_name, std::initializer_list names) { + for (const char *name : names) { + dlerror(); + void *sym = dlsym(RTLD_DEFAULT, name); + const char *err = dlerror(); + if (err == nullptr && sym != nullptr) { + if (resolved_name != nullptr) { + *resolved_name = name; + } + return reinterpret_cast(sym); + } + } + if (resolved_name != nullptr) { + *resolved_name = nullptr; + } + return nullptr; +} + +static constexpr uint32_t kHostRegisterMappedFlag = +#if defined(RT_HOST_REGISTER_MAPPED) + RT_HOST_REGISTER_MAPPED; +#elif defined(ACL_HOST_REGISTER_MAPPED) + ACL_HOST_REGISTER_MAPPED; +#else + 0U; +#endif + +int ensure_current_device_for_share_mem(uint32_t device_id) { + const char *symbol_name = nullptr; + if (GetDeviceFn get_device_fn = 
resolve_symbol(&symbol_name, {"aclrtGetDevice", "rtGetDevice"})) { + int32_t current_device = -1; + int rc = get_device_fn(&current_device); + if (rc != 0) { + LOG_INFO( + "ensure_current_device_for_share_mem: %s failed rc=%d, trying to set device to %u", + symbol_name, rc, device_id + ); + if (DirectSetDeviceFn set_device_fn = + resolve_symbol(&symbol_name, {"rtSetDevice", "aclrtSetDevice"})) { + rc = set_device_fn(static_cast(device_id)); + if (rc != 0) { + LOG_ERROR( + "ensure_current_device_for_share_mem: %s(%u) failed: rc=%d", symbol_name, device_id, rc + ); + return rc; + } + return 0; + } + LOG_ERROR("ensure_current_device_for_share_mem: missing symbols rtSetDevice / aclrtSetDevice"); + return rc; + } + + if (current_device != static_cast(device_id)) { + LOG_ERROR( + "ensure_current_device_for_share_mem: current device %d does not match requested device %u", + static_cast(current_device), device_id + ); + return -1; + } + return 0; + } + + LOG_ERROR("ensure_current_device_for_share_mem: missing symbols aclrtGetDevice / rtGetDevice"); + return -1; +} + +} // namespace + extern "C" { /* =========================================================================== @@ -114,8 +199,141 @@ void destroy_device_context(DeviceContextHandle ctx) { delete static_cast(ctx)->attach_current_thread(device_id); + } catch (...) 
{ + return -1; + } +} + +int mallocHostDeviceShareMem(uint32_t deviceId, uint64_t size, void **hostPtr, void **devPtr) { + if (hostPtr == NULL || devPtr == NULL || size == 0) { + return -1; + } + + *hostPtr = nullptr; + *devPtr = nullptr; + + int rc = ensure_current_device_for_share_mem(deviceId); + if (rc != 0) { + return rc; + } + + void *allocated_host_ptr = nullptr; + const char *symbol_name = nullptr; + if (RtMallocHostFn malloc_fn = resolve_symbol(&symbol_name, {"rtMallocHost"})) { + rc = malloc_fn(&allocated_host_ptr, size, 0U); + if (rc != 0 || allocated_host_ptr == nullptr) { + LOG_ERROR("mallocHostDeviceShareMem via %s failed on rtMallocHost: rc=%d size=%" PRIu64, symbol_name, rc, size); + return (rc != 0) ? rc : -1; + } + } else if (AclMallocHostFn malloc_fn = resolve_symbol(&symbol_name, {"aclrtMallocHost"})) { + rc = malloc_fn(&allocated_host_ptr, static_cast(size)); + if (rc != 0 || allocated_host_ptr == nullptr) { + LOG_ERROR("mallocHostDeviceShareMem via %s failed on aclrtMallocHost: rc=%d size=%" PRIu64, symbol_name, rc, size); + return (rc != 0) ? rc : -1; + } + } else { + LOG_ERROR("mallocHostDeviceShareMem: missing symbols rtMallocHost / aclrtMallocHost"); + return -1; + } + + if (RtHostRegisterFn register_fn = + resolve_symbol(&symbol_name, {"rtsHostRegister", "rtHostRegister"})) { + rc = register_fn(allocated_host_ptr, size, kHostRegisterMappedFlag, devPtr); + if (rc != 0 || *devPtr == nullptr) { + LOG_ERROR( + "mallocHostDeviceShareMem via %s failed on host register: rc=%d host=%p size=%" PRIu64 " flag=%u", + symbol_name, rc, allocated_host_ptr, size, kHostRegisterMappedFlag + ); + if (RtFreeHostFn free_fn = resolve_symbol(nullptr, {"rtFreeHost"})) { + free_fn(allocated_host_ptr); + } else if (AclFreeHostFn free_fn = resolve_symbol(nullptr, {"aclrtFreeHost"})) { + free_fn(allocated_host_ptr); + } + return (rc != 0) ? 
rc : -1; + } + } else if (AclHostRegisterFn register_fn = + resolve_symbol(&symbol_name, {"aclrtHostRegister"})) { + rc = register_fn(allocated_host_ptr, size, kHostRegisterMappedFlag, devPtr); + if (rc != 0 || *devPtr == nullptr) { + LOG_ERROR( + "mallocHostDeviceShareMem via %s failed on host register: rc=%d host=%p size=%" PRIu64 " flag=%u", + symbol_name, rc, allocated_host_ptr, size, kHostRegisterMappedFlag + ); + if (RtFreeHostFn free_fn = resolve_symbol(nullptr, {"rtFreeHost"})) { + free_fn(allocated_host_ptr); + } else if (AclFreeHostFn free_fn = resolve_symbol(nullptr, {"aclrtFreeHost"})) { + free_fn(allocated_host_ptr); + } + return (rc != 0) ? rc : -1; + } + } else { + LOG_ERROR("mallocHostDeviceShareMem: missing symbols rtsHostRegister / rtHostRegister / aclrtHostRegister"); + if (RtFreeHostFn free_fn = resolve_symbol(nullptr, {"rtFreeHost"})) { + free_fn(allocated_host_ptr); + } else if (AclFreeHostFn free_fn = resolve_symbol(nullptr, {"aclrtFreeHost"})) { + free_fn(allocated_host_ptr); + } + return -1; + } + + *hostPtr = allocated_host_ptr; + LOG_INFO( + "mallocHostDeviceShareMem: device=%u host=%p dev=%p size=%" PRIu64, deviceId, *hostPtr, *devPtr, size + ); + return 0; +} + +int freeHostDeviceShareMem(uint32_t deviceId, void *hostPtr) { + if (hostPtr == NULL) { + return 0; + } + + int rc = ensure_current_device_for_share_mem(deviceId); + if (rc != 0) { + return rc; + } + + const char *symbol_name = nullptr; + if (RtHostUnregisterFn unregister_fn = + resolve_symbol(&symbol_name, {"rtsHostUnregister", "rtHostUnregister"})) { + rc = unregister_fn(hostPtr); + if (rc != 0) { + LOG_ERROR("freeHostDeviceShareMem via %s failed on unregister: rc=%d host=%p", symbol_name, rc, hostPtr); + return rc; + } + } else if (AclHostUnregisterFn unregister_fn = + resolve_symbol(&symbol_name, {"aclrtHostUnregister"})) { + rc = unregister_fn(hostPtr); + if (rc != 0) { + LOG_ERROR("freeHostDeviceShareMem via %s failed on unregister: rc=%d host=%p", symbol_name, rc, 
hostPtr); + return rc; + } + } else { + LOG_ERROR("freeHostDeviceShareMem: missing symbols rtsHostUnregister / rtHostUnregister / aclrtHostUnregister"); + return -1; + } + + if (RtFreeHostFn free_fn = resolve_symbol(&symbol_name, {"rtFreeHost"})) { + rc = free_fn(hostPtr); + if (rc != 0) { + LOG_ERROR("freeHostDeviceShareMem via %s failed on free: rc=%d host=%p", symbol_name, rc, hostPtr); + return rc; + } + } else if (AclFreeHostFn free_fn = resolve_symbol(&symbol_name, {"aclrtFreeHost"})) { + rc = free_fn(hostPtr); + if (rc != 0) { + LOG_ERROR("freeHostDeviceShareMem via %s failed on free: rc=%d host=%p", symbol_name, rc, hostPtr); + return rc; + } + } else { + LOG_ERROR("freeHostDeviceShareMem: missing symbols rtFreeHost / aclrtFreeHost"); + return -1; + } + + LOG_INFO("freeHostDeviceShareMem: device=%u host=%p", deviceId, hostPtr); return 0; } diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index 3e7dfd89e..4d6fc6fa6 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -121,6 +121,26 @@ int set_device(DeviceContextHandle ctx, int device_id) { return 0; } +int mallocHostDeviceShareMem(uint32_t deviceId, uint64_t size, void **hostPtr, void **devPtr) { + (void)deviceId; + (void)size; + if (hostPtr != nullptr) { + *hostPtr = nullptr; + } + if (devPtr != nullptr) { + *devPtr = nullptr; + } + LOG_ERROR("mallocHostDeviceShareMem is not supported on a2a3sim"); + return -1; +} + +int freeHostDeviceShareMem(uint32_t deviceId, void *hostPtr) { + (void)deviceId; + (void)hostPtr; + LOG_ERROR("freeHostDeviceShareMem is not supported on a2a3sim"); + return -1; +} + int run_runtime( DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, diff --git 
a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp index 2808152eb..8275c69af 100644 --- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp @@ -20,6 +20,9 @@ #include "callable.h" #include "task_args.h" +#include +#include +#include #include #include @@ -28,6 +31,88 @@ #include "host/raii_scope_guard.h" #include "runtime.h" +namespace { + +using RtMallocHostFn = int (*)(void **, uint64_t, uint32_t); +using RtFreeHostFn = int (*)(void *); +using RtHostRegisterFn = int (*)(void *, uint64_t, uint32_t, void **); +using RtHostUnregisterFn = int (*)(void *); +using AclMallocHostFn = int (*)(void **, size_t); +using AclFreeHostFn = int (*)(void *); +using AclHostRegisterFn = int (*)(void *, uint64_t, uint32_t, void **); +using AclHostUnregisterFn = int (*)(void *); +using GetDeviceFn = int (*)(int32_t *); +using DirectSetDeviceFn = int (*)(int32_t); + +template +Fn resolve_symbol(const char **resolved_name, std::initializer_list names) { + for (const char *name : names) { + dlerror(); + void *sym = dlsym(RTLD_DEFAULT, name); + const char *err = dlerror(); + if (err == nullptr && sym != nullptr) { + if (resolved_name != nullptr) { + *resolved_name = name; + } + return reinterpret_cast(sym); + } + } + if (resolved_name != nullptr) { + *resolved_name = nullptr; + } + return nullptr; +} + +static constexpr uint32_t kHostRegisterMappedFlag = +#if defined(RT_HOST_REGISTER_MAPPED) + RT_HOST_REGISTER_MAPPED; +#elif defined(ACL_HOST_REGISTER_MAPPED) + ACL_HOST_REGISTER_MAPPED; +#else + 0U; +#endif + +int ensure_current_device_for_share_mem(uint32_t device_id) { + const char *symbol_name = nullptr; + if (GetDeviceFn get_device_fn = resolve_symbol(&symbol_name, {"aclrtGetDevice", "rtGetDevice"})) { + int32_t current_device = -1; + int rc = get_device_fn(&current_device); + if (rc != 0) { + LOG_INFO( + "ensure_current_device_for_share_mem: %s failed rc=%d, trying to set device 
to %u", + symbol_name, rc, device_id + ); + if (DirectSetDeviceFn set_device_fn = + resolve_symbol(&symbol_name, {"rtSetDevice", "aclrtSetDevice"})) { + rc = set_device_fn(static_cast(device_id)); + if (rc != 0) { + LOG_ERROR( + "ensure_current_device_for_share_mem: %s(%u) failed: rc=%d", symbol_name, device_id, rc + ); + return rc; + } + return 0; + } + LOG_ERROR("ensure_current_device_for_share_mem: missing symbols rtSetDevice / aclrtSetDevice"); + return rc; + } + + if (current_device != static_cast(device_id)) { + LOG_ERROR( + "ensure_current_device_for_share_mem: current device %d does not match requested device %u", + static_cast(current_device), device_id + ); + return -1; + } + return 0; + } + + LOG_ERROR("ensure_current_device_for_share_mem: missing symbols aclrtGetDevice / rtGetDevice"); + return -1; +} + +} // namespace + extern "C" { /* =========================================================================== @@ -114,8 +199,144 @@ void destroy_device_context(DeviceContextHandle ctx) { delete static_cast(ctx)->attach_current_thread(device_id); + } catch (...) { + return -1; + } +} + +int mallocHostDeviceShareMem(uint32_t deviceId, uint64_t size, void **hostPtr, void **devPtr) { + if (hostPtr == NULL || devPtr == NULL || size == 0) { + return -1; + } + + *hostPtr = nullptr; + *devPtr = nullptr; + + int rc = ensure_current_device_for_share_mem(deviceId); + if (rc != 0) { + return rc; + } + + void *allocated_host_ptr = nullptr; + const char *symbol_name = nullptr; + if (RtMallocHostFn malloc_fn = resolve_symbol(&symbol_name, {"rtMallocHost"})) { + rc = malloc_fn(&allocated_host_ptr, size, 0U); + if (rc != 0 || allocated_host_ptr == nullptr) { + LOG_ERROR("mallocHostDeviceShareMem via %s failed on rtMallocHost: rc=%d size=%" PRIu64, symbol_name, rc, size); + return (rc != 0) ? 
rc : -1; + } + } else if (AclMallocHostFn malloc_fn = resolve_symbol(&symbol_name, {"aclrtMallocHost"})) { + rc = malloc_fn(&allocated_host_ptr, static_cast(size)); + if (rc != 0 || allocated_host_ptr == nullptr) { + LOG_ERROR( + "mallocHostDeviceShareMem via %s failed on aclrtMallocHost: rc=%d size=%" PRIu64, symbol_name, rc, + size + ); + return (rc != 0) ? rc : -1; + } + } else { + LOG_ERROR("mallocHostDeviceShareMem: missing symbols rtMallocHost / aclrtMallocHost"); + return -1; + } + + if (RtHostRegisterFn register_fn = + resolve_symbol(&symbol_name, {"rtsHostRegister", "rtHostRegister"})) { + rc = register_fn(allocated_host_ptr, size, kHostRegisterMappedFlag, devPtr); + if (rc != 0 || *devPtr == nullptr) { + LOG_ERROR( + "mallocHostDeviceShareMem via %s failed on host register: rc=%d host=%p size=%" PRIu64 " flag=%u", + symbol_name, rc, allocated_host_ptr, size, kHostRegisterMappedFlag + ); + if (RtFreeHostFn free_fn = resolve_symbol(nullptr, {"rtFreeHost"})) { + free_fn(allocated_host_ptr); + } else if (AclFreeHostFn free_fn = resolve_symbol(nullptr, {"aclrtFreeHost"})) { + free_fn(allocated_host_ptr); + } + return (rc != 0) ? rc : -1; + } + } else if (AclHostRegisterFn register_fn = + resolve_symbol(&symbol_name, {"aclrtHostRegister"})) { + rc = register_fn(allocated_host_ptr, size, kHostRegisterMappedFlag, devPtr); + if (rc != 0 || *devPtr == nullptr) { + LOG_ERROR( + "mallocHostDeviceShareMem via %s failed on host register: rc=%d host=%p size=%" PRIu64 " flag=%u", + symbol_name, rc, allocated_host_ptr, size, kHostRegisterMappedFlag + ); + if (RtFreeHostFn free_fn = resolve_symbol(nullptr, {"rtFreeHost"})) { + free_fn(allocated_host_ptr); + } else if (AclFreeHostFn free_fn = resolve_symbol(nullptr, {"aclrtFreeHost"})) { + free_fn(allocated_host_ptr); + } + return (rc != 0) ? 
rc : -1; + } + } else { + LOG_ERROR("mallocHostDeviceShareMem: missing symbols rtsHostRegister / rtHostRegister / aclrtHostRegister"); + if (RtFreeHostFn free_fn = resolve_symbol(nullptr, {"rtFreeHost"})) { + free_fn(allocated_host_ptr); + } else if (AclFreeHostFn free_fn = resolve_symbol(nullptr, {"aclrtFreeHost"})) { + free_fn(allocated_host_ptr); + } + return -1; + } + + *hostPtr = allocated_host_ptr; + LOG_INFO( + "mallocHostDeviceShareMem: device=%u host=%p dev=%p size=%" PRIu64, deviceId, *hostPtr, *devPtr, size + ); + return 0; +} + +int freeHostDeviceShareMem(uint32_t deviceId, void *hostPtr) { + if (hostPtr == NULL) { + return 0; + } + + int rc = ensure_current_device_for_share_mem(deviceId); + if (rc != 0) { + return rc; + } + + const char *symbol_name = nullptr; + if (RtHostUnregisterFn unregister_fn = + resolve_symbol(&symbol_name, {"rtsHostUnregister", "rtHostUnregister"})) { + rc = unregister_fn(hostPtr); + if (rc != 0) { + LOG_ERROR("freeHostDeviceShareMem via %s failed on unregister: rc=%d host=%p", symbol_name, rc, hostPtr); + return rc; + } + } else if (AclHostUnregisterFn unregister_fn = + resolve_symbol(&symbol_name, {"aclrtHostUnregister"})) { + rc = unregister_fn(hostPtr); + if (rc != 0) { + LOG_ERROR("freeHostDeviceShareMem via %s failed on unregister: rc=%d host=%p", symbol_name, rc, hostPtr); + return rc; + } + } else { + LOG_ERROR("freeHostDeviceShareMem: missing symbols rtsHostUnregister / rtHostUnregister / aclrtHostUnregister"); + return -1; + } + + if (RtFreeHostFn free_fn = resolve_symbol(&symbol_name, {"rtFreeHost"})) { + rc = free_fn(hostPtr); + if (rc != 0) { + LOG_ERROR("freeHostDeviceShareMem via %s failed on free: rc=%d host=%p", symbol_name, rc, hostPtr); + return rc; + } + } else if (AclFreeHostFn free_fn = resolve_symbol(&symbol_name, {"aclrtFreeHost"})) { + rc = free_fn(hostPtr); + if (rc != 0) { + LOG_ERROR("freeHostDeviceShareMem via %s failed on free: rc=%d host=%p", symbol_name, rc, hostPtr); + return rc; + } + } else { 
+ LOG_ERROR("freeHostDeviceShareMem: missing symbols rtFreeHost / aclrtFreeHost"); + return -1; + } + + LOG_INFO("freeHostDeviceShareMem: device=%u host=%p", deviceId, hostPtr); return 0; } diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp index 3e7dfd89e..52f1f57c6 100644 --- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp @@ -121,6 +121,26 @@ int set_device(DeviceContextHandle ctx, int device_id) { return 0; } +int mallocHostDeviceShareMem(uint32_t deviceId, uint64_t size, void **hostPtr, void **devPtr) { + (void)deviceId; + (void)size; + if (hostPtr != nullptr) { + *hostPtr = nullptr; + } + if (devPtr != nullptr) { + *devPtr = nullptr; + } + LOG_ERROR("mallocHostDeviceShareMem is not supported on a5sim"); + return -1; +} + +int freeHostDeviceShareMem(uint32_t deviceId, void *hostPtr) { + (void)deviceId; + (void)hostPtr; + LOG_ERROR("freeHostDeviceShareMem is not supported on a5sim"); + return -1; +} + int run_runtime( DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp index e919fcc27..e86bf601e 100644 --- a/src/common/worker/chip_worker.cpp +++ b/src/common/worker/chip_worker.cpp @@ -37,6 +37,14 @@ T load_symbol(void *handle, const char *name) { return reinterpret_cast(sym); } +template +T try_load_symbol(void *handle, const char *name) { + dlerror(); // clear any existing error + void *sym = dlsym(handle, name); + (void)dlerror(); + return reinterpret_cast(sym); +} + // Process-wide singleton: libcpu_sim_context.so is loaded once with // RTLD_GLOBAL so that host_runtime.so can resolve sim_context_set_* and // pto_sim_get_* symbols at runtime. Never dlclosed. 
@@ -117,6 +125,9 @@ void ChipWorker::init( get_runtime_size_fn_ = load_symbol(handle, "get_runtime_size"); run_runtime_fn_ = load_symbol(handle, "run_runtime"); finalize_device_fn_ = load_symbol(handle, "finalize_device"); + malloc_host_device_share_mem_fn_ = + try_load_symbol(handle, "mallocHostDeviceShareMem"); + free_host_device_share_mem_fn_ = try_load_symbol(handle, "freeHostDeviceShareMem"); } catch (...) { dlclose(handle); throw; @@ -164,6 +175,49 @@ void ChipWorker::reset_device() { device_set_ = false; } +void ChipWorker::mallocHostDeviceShareMem(uint64_t size, uint64_t *host_ptr, uint64_t *dev_ptr, int device_id) { + if (!device_set_) { + throw std::runtime_error("ChipWorker device not set; call set_device() first"); + } + if (host_ptr == nullptr || dev_ptr == nullptr) { + throw std::runtime_error("mallocHostDeviceShareMem requires non-null output pointers"); + } + if (malloc_host_device_share_mem_fn_ == nullptr) { + throw std::runtime_error("mallocHostDeviceShareMem symbol is not available in the bound runtime"); + } + + *host_ptr = 0; + *dev_ptr = 0; + int effective_device_id = (device_id >= 0) ? 
device_id : device_id_; + void *host_ptr_raw = nullptr; + void *dev_ptr_raw = nullptr; + int rc = malloc_host_device_share_mem_fn_(static_cast(effective_device_id), size, &host_ptr_raw, &dev_ptr_raw); + if (rc != 0 || host_ptr_raw == nullptr || dev_ptr_raw == nullptr) { + throw std::runtime_error("mallocHostDeviceShareMem failed with code " + std::to_string(rc)); + } + + *host_ptr = reinterpret_cast(host_ptr_raw); + *dev_ptr = reinterpret_cast(dev_ptr_raw); +} + +void ChipWorker::freeHostDeviceShareMem(uint64_t host_ptr, int device_id) { + if (host_ptr == 0) { + return; + } + if (!device_set_) { + throw std::runtime_error("ChipWorker device not set; call set_device() first"); + } + if (free_host_device_share_mem_fn_ == nullptr) { + throw std::runtime_error("freeHostDeviceShareMem symbol is not available in the bound runtime"); + } + + int effective_device_id = (device_id >= 0) ? device_id : device_id_; + int rc = free_host_device_share_mem_fn_(static_cast(effective_device_id), reinterpret_cast(host_ptr)); + if (rc != 0) { + throw std::runtime_error("freeHostDeviceShareMem failed with code " + std::to_string(rc)); + } +} + void ChipWorker::finalize() { reset_device(); if (device_ctx_ != nullptr && destroy_device_context_fn_ != nullptr) { @@ -180,6 +234,8 @@ void ChipWorker::finalize() { get_runtime_size_fn_ = nullptr; run_runtime_fn_ = nullptr; finalize_device_fn_ = nullptr; + malloc_host_device_share_mem_fn_ = nullptr; + free_host_device_share_mem_fn_ = nullptr; runtime_buf_.clear(); aicpu_binary_.clear(); aicore_binary_.clear(); diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h index 2a436c821..155840a38 100644 --- a/src/common/worker/chip_worker.h +++ b/src/common/worker/chip_worker.h @@ -43,6 +43,12 @@ class ChipWorker : public IWorker { /// After this, set_device() can be called again with a new device ID. void reset_device(); + /// Allocate host memory and register it as a device-visible mapped buffer. 
+ void mallocHostDeviceShareMem(uint64_t size, uint64_t *host_ptr, uint64_t *dev_ptr, int device_id = -1); + + /// Unregister and free a mapped host buffer. + void freeHostDeviceShareMem(uint64_t host_ptr, int device_id = -1); + /// Tear down everything: device resources and runtime library. /// Terminal — the object cannot be reused after this. void finalize(); @@ -70,6 +76,8 @@ class ChipWorker : public IWorker { int, int ); using FinalizeDeviceFn = int (*)(void *); + using MallocHostDeviceShareMemFn = int (*)(uint32_t, uint64_t, void **, void **); + using FreeHostDeviceShareMemFn = int (*)(uint32_t, void *); void *lib_handle_ = nullptr; CreateDeviceContextFn create_device_context_fn_ = nullptr; @@ -78,6 +86,8 @@ class ChipWorker : public IWorker { GetRuntimeSizeFn get_runtime_size_fn_ = nullptr; RunRuntimeFn run_runtime_fn_ = nullptr; FinalizeDeviceFn finalize_device_fn_ = nullptr; + MallocHostDeviceShareMemFn malloc_host_device_share_mem_fn_ = nullptr; + FreeHostDeviceShareMemFn free_host_device_share_mem_fn_ = nullptr; void *device_ctx_ = nullptr; std::vector runtime_buf_; diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h index f8a811d94..dc6c334ee 100644 --- a/src/common/worker/pto_runtime_c_api.h +++ b/src/common/worker/pto_runtime_c_api.h @@ -17,7 +17,8 @@ * * Public API — resolved by ChipWorker via dlsym: * create_device_context, destroy_device_context, - * get_runtime_size, set_device, run_runtime, finalize_device + * get_runtime_size, set_device, run_runtime, finalize_device, + * mallocHostDeviceShareMem, freeHostDeviceShareMem * * Memory management: caller allocates a buffer of get_runtime_size() bytes * and passes it to run_runtime(). Error codes: 0 = success, negative = error. @@ -59,6 +60,26 @@ size_t get_runtime_size(void); /** Set the target device. Must be called before the first run_runtime(). 
*/ int set_device(DeviceContextHandle ctx, int device_id); +/** + * Allocate host memory and register it as a device-visible mapped address. + * + * @param deviceId Target device ID + * @param size Size in bytes + * @param hostPtr Output host pointer + * @param devPtr Output mapped device-visible address + * @return 0 on success, negative on error + */ +int mallocHostDeviceShareMem(uint32_t deviceId, uint64_t size, void **hostPtr, void **devPtr); + +/** + * Unregister and free host memory previously created by mallocHostDeviceShareMem(). + * + * @param deviceId Target device ID + * @param hostPtr Host pointer previously returned by mallocHostDeviceShareMem() + * @return 0 on success, negative on error + */ +int freeHostDeviceShareMem(uint32_t deviceId, void *hostPtr); + /** * Build the task graph, execute on device, copy results back, and clean up. *