Skip to content
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Copyright (c) PyPTO Contributors.
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
# CANN Open Software License Agreement Version 2.0 (the "License").
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.
# -----------------------------------------------------------------------------------------------------------
"""Golden script for the host_register_mapped demo."""

import atexit
import ctypes
import logging

import numpy as np
import torch

from simpler.task_interface import free_host_device_share_mem, malloc_host_device_share_mem

logger = logging.getLogger(__name__)

# Tensor names the runtime copies back from the device after the run.
__outputs__ = ["mapped_out"]

# Tolerances used when comparing device results against the golden output.
RTOL = 1e-5
ATOL = 1e-5
# Number of leading elements shown in log previews.
LOG_PREVIEW_COUNT = 16

# Buffer geometry: one ROWS x COLS float32 matrix.
ROWS = 128
COLS = 128
SIZE = ROWS * COLS

# Live handles for the shared host/device mapping, populated by
# generate_inputs() and released by _cleanup_mapped_state().  Also keeps the
# numpy/torch views alive so the underlying buffer is not collected mid-run.
_MAPPED_STATE = {}


def _log_preview(label: str, values) -> None:
flat = np.asarray(values).reshape(-1)
preview = flat[:LOG_PREVIEW_COUNT].tolist()
logger.info("%s first_%d=%s total=%d", label, min(LOG_PREVIEW_COUNT, flat.size), preview, flat.size)


def _cleanup_mapped_state() -> None:
host_ptr = _MAPPED_STATE.get("host_ptr", 0)
if not host_ptr:
_MAPPED_STATE.clear()
return

try:
free_host_device_share_mem(host_ptr)
except Exception as exc: # noqa: BLE001
logger.warning("free_host_device_share_mem cleanup failed: %s", exc)

_MAPPED_STATE.clear()


atexit.register(_cleanup_mapped_state)


def generate_inputs(params: dict) -> list:
    """Allocate and seed the shared host/device buffer; build the runtime args.

    Returns (name, value) pairs: the copy-back output tensor and the
    device-visible address of the mapped buffer as a scalar.
    """
    del params
    # Drop any mapping left over from a previous round before allocating anew.
    _cleanup_mapped_state()

    num_bytes = SIZE * ctypes.sizeof(ctypes.c_float)
    host_ptr, mapped_dev_ptr = malloc_host_device_share_mem(num_bytes)

    # View the raw host allocation as float32 and seed it with 0, 1, 2, ...
    ctypes_view = (ctypes.c_float * SIZE).from_address(host_ptr)
    host_array = np.ctypeslib.as_array(ctypes_view)
    host_array[:] = np.arange(SIZE, dtype=np.float32)
    shared_tensor = torch.from_numpy(host_array)
    _log_preview("host_register_mapped_demo: host_init_data", host_array)

    result_tensor = torch.zeros_like(shared_tensor)

    # Keep every live handle so cleanup can free the mapping and the
    # numpy/torch views stay referenced while the buffer is in use.
    _MAPPED_STATE["host_ptr"] = host_ptr
    _MAPPED_STATE["mapped_dev_ptr"] = mapped_dev_ptr
    _MAPPED_STATE["host_buf"] = ctypes_view
    _MAPPED_STATE["host_np"] = host_array
    _MAPPED_STATE["host_tensor"] = shared_tensor

    logger.info(
        "host_register_mapped_demo: host_ptr=0x%x mapped_dev_ptr=0x%x size=%d",
        host_ptr,
        mapped_dev_ptr,
        shared_tensor.numel() * shared_tensor.element_size(),
    )

    return [
        ("mapped_out", result_tensor),
        ("mapped_dev_ptr", ctypes.c_uint64(mapped_dev_ptr)),
    ]


def compute_golden(tensors: dict, params: dict) -> None:
    """Fill ``tensors["mapped_out"]`` with the expected result: host data + 1."""
    del params
    expected = _MAPPED_STATE["host_tensor"] + 1.0
    tensors["mapped_out"][:] = expected


def post_run_collect(outputs: dict, params: dict) -> None:
    """Log the post-run host buffer and copy-back output, then release state."""
    del params
    after_run = _MAPPED_STATE.get("host_np")
    if after_run is not None:
        _log_preview("host_register_mapped_demo: host_data_after_run", after_run)
    copied_back = outputs.get("mapped_out")
    if copied_back is not None:
        _log_preview("host_register_mapped_demo: device_copy_back_data", copied_back.detach().cpu().numpy())
    _cleanup_mapped_state()
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/*
* Copyright (c) PyPTO Contributors.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
* -----------------------------------------------------------------------------------------------------------
*/
/**
 * Demo kernel — both destinations receive the same computed tile, so each is
 * the ORIGINAL buffer value plus one:
 *   mapped_host_buffer[i] = original_value[i] + 1.0f
 *   out[i] = original_value[i] + 1.0f
 *
 * The input pointer comes from host_register_mapped(), so a successful result
 * shows that the kernel was able to read and write the mapped host buffer
 * directly while also producing a regular copy-back output.
 */

#include <cstdint>
#include <pto/pto-inst.hpp>

#include "tensor.h" // NOLINT(build/include_subdir)

using namespace pto; // NOLINT(build/namespaces)

#ifndef __gm__
#define __gm__
#endif

#ifndef __aicore__
#define __aicore__ [aicore] // NOLINT(whitespace/braces)
#endif

// AIV kernel entry: adds 1.0f to the mapped host buffer in place and writes
// the same result tile to the regular copy-back output.
extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
    // args[0]: descriptor of the mapped host buffer (INOUT — updated in place).
    // args[1]: descriptor of the regular copy-back output (OUT).
    __gm__ Tensor *mapped_host_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
    // Element base pointer = buffer base address + start offset (in elements).
    __gm__ float *mapped_host = reinterpret_cast<__gm__ float *>(mapped_host_tensor->buffer.addr) + mapped_host_tensor->start_offset;
    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;

    constexpr float kAddValue = 1.0f;
    // A single 128x128 float tile covers the whole demo buffer in one pass
    // (matches ROWS/COLS in the golden script).
    constexpr int kTRows_ = 128;
    constexpr int kTCols_ = 128;
    constexpr int vRows = 128;
    constexpr int vCols = 128;

    // 5-D global view with only the last two dims populated: a row-major
    // vRows x vCols plane with row stride kTCols_.
    using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>;
    using DynStridDim5 = Stride<1, 1, 1, kTCols_, 1>;
    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
    using TileData = Tile<TileType::Vec, float, kTRows_, kTCols_, BLayout::RowMajor, -1, -1>;

    // Two local tiles at fixed offsets 0x0 and 0x10000; a 128x128 float tile
    // occupies exactly 0x10000 bytes, so the tiles do not overlap.
    TileData src_tile(vRows, vCols);
    TileData dst_tile(vRows, vCols);
    TASSIGN(src_tile, 0x0);
    TASSIGN(dst_tile, 0x10000);

    GlobalData mapped_host_global(mapped_host);
    GlobalData dst_global(out);

    // Load the mapped host data, then order MTE2 (load) before V (vector math).
    TLOAD(src_tile, mapped_host_global);
    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
    // dst_tile = src_tile + 1.0f
    TADDS(dst_tile, src_tile, kAddValue);
    // Order V (compute) before MTE3 (store) so the stores see the result.
    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
    // Store the same result tile twice: back into the mapped host buffer
    // (in-place update) and into the regular output for runtime copy-back.
    TSTORE(mapped_host_global, dst_tile);
    TSTORE(dst_global, dst_tile);

    // Final MTE3 -> S handshake: scalar pipe waits until both stores retire
    // before the kernel returns.
    set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
    wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Copyright (c) PyPTO Contributors.
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
# CANN Open Software License Agreement Version 2.0 (the "License").
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.
# -----------------------------------------------------------------------------------------------------------
"""Kernel config for the host_register_mapped demo."""

from pathlib import Path

from simpler.task_interface import ArgDirection as D # pyright: ignore[reportAttributeAccessIssue]

# Directory that holds this demo's kernel and orchestration sources.
_KERNELS_ROOT = Path(__file__).parent

# AICPU orchestration entry compiled from the orchestration source below.
ORCHESTRATION = {
    "source": str(_KERNELS_ROOT / "orchestration" / "host_register_mapped_orch.cpp"),
    "function_name": "aicpu_orchestration_entry",
    # ChipStorageTaskArgs stores tensors first and scalars after them.
    "signature": [D.OUT, D.SCALAR],
}

# Device kernels the orchestration submits; func_id is the submit index used
# by pto2_rt_submit_aiv_task.
KERNELS = [
    {
        "func_id": 0,
        "source": str(_KERNELS_ROOT / "aiv" / "kernel_load_add_one.cpp"),
        "core_type": "aiv",
        # INOUT: mapped host buffer (updated in place); OUT: copy-back tensor.
        "signature": [D.INOUT, D.OUT],
    },
]

# Runtime selection and launch dimensions for the demo.
RUNTIME_CONFIG = {
    "runtime": "tensormap_and_ringbuffer",
    "aicpu_thread_num": 4,
    "block_dim": 3,
    "rounds": 1,
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* Copyright (c) PyPTO Contributors.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
* -----------------------------------------------------------------------------------------------------------
*/
/**
* Demo orchestration for host-side mapped memory.
*
* Args layout in ChipStorageTaskArgs:
* tensor(0): mapped_out (host tensor copied back by runtime)
* scalar(0): mapped_dev_ptr (device-visible address returned by host_register_mapped)
*
* The mapped host buffer is wrapped as an external tensor and submitted as
* INOUT so the kernel updates host-visible memory in place.
*/

#include <stdint.h>

#include "pto_orchestration_api.h" // NOLINT(build/include_subdir)

extern "C" {

// Report how many task arguments the runtime should hand to the orchestration
// entry: tensor(0) = mapped_out and scalar(0) = mapped_dev_ptr, i.e. two.
__attribute__((visibility("default"))) PTO2OrchestrationConfig
aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) {
    (void)orch_args;  // Unused: the arg count is fixed for this demo.
    return PTO2OrchestrationConfig{
        .expected_arg_count = 2,
    };
}

// Orchestration entry: wrap the device-visible mapped address as an external
// tensor and submit a single AIV task that updates it in place while also
// producing the regular copy-back output.
__attribute__((visibility("default"))) void aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args) {
    // tensor(0): copy-back output tensor provided by the runtime.
    const ContinuousTensor &out_arg = orch_args.tensor(0);
    Tensor mapped_out = from_tensor_arg(out_arg);

    // scalar(0): device-visible address returned by host_register_mapped.
    uint64_t mapped_input_u64 = orch_args.scalar(0);
    // Reuse the output's shape/ndims/dtype for the mapped buffer; both views
    // describe the same number of elements.
    Tensor mapped_host_buffer = make_tensor_external(
        reinterpret_cast<void *>(static_cast<uintptr_t>(mapped_input_u64)), out_arg.shapes, out_arg.ndims, out_arg.dtype
    );

    // NOTE(review): "%lx" assumes out_arg.data is a 64-bit integer and "%u"
    // assumes shapes[0] is unsigned int — confirm against ChipStorageTaskArgs.
    LOG_INFO(
        "host_register_mapped_demo: mapped_host_buffer=0x%lx mapped_out=0x%lx elements=%u", mapped_input_u64, out_arg.data,
        out_arg.shapes[0]
    );

    // Argument order must match the kernel config signature [INOUT, OUT].
    Arg params;
    params.add_inout(mapped_host_buffer);
    params.add_output(mapped_out);
    pto2_rt_submit_aiv_task(0, params);
}

} // extern "C"
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# A5 Host Register Mapped Demo

这个 demo 用来验证 `a5` 平台上的两件事:
- `mallocHostDeviceShareMem(...)` 可以在 Host 侧申请并注册一段 Device 可访问地址
- AIV kernel 可以直接读取并写回这段映射内存

## 本次 a5 修改点

- 在 `src/a5/platform/onboard/host/pto_runtime_c_api.cpp` 中实现了:
- `mallocHostDeviceShareMem(...)`
- `freeHostDeviceShareMem(...)`
- 这两个接口的执行顺序和 `a2a3` 保持一致:
- `GetDevice / SetDevice`
- `MallocHost / FreeHost`
- `HostRegister / HostUnregister`
- Python 侧继续复用通用封装:
- `malloc_host_device_share_mem(...)`
- `free_host_device_share_mem(...)`
- 新增了 `examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo` 用于硬件验证

## Demo 行为

- Host 通过 `malloc_host_device_share_mem(...)` 拿到:
- `host_ptr`
- `mapped_dev_ptr`
- Host 把 `host_ptr` 初始化为 `0, 1, 2, ...`
- orchestration 把 `mapped_dev_ptr` 包成外部 tensor
- kernel 执行(两个输出都基于原始数据计算,结果相同):
  - `mapped_host_buffer[i] = 原始值[i] + 1`
  - `mapped_out[i] = 原始值[i] + 1`
- 运行结束后打印:
- 初始 Host 数据
- 执行后 Host 内存数据
- 普通 output copy-back 数据

## 启动命令

在仓库根目录执行:

```bash
python examples/scripts/run_example.py --build \
-k examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/kernels \
-g examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/golden.py \
-p a5 -d 0
```

如果环境已经提前编好了 runtime,也可以去掉 `--build`。

## 结果判断

成功时建议重点看三组日志:
- `host_register_mapped_demo: host_init_data`
- `host_register_mapped_demo: host_data_after_run`
- `host_register_mapped_demo: device_copy_back_data`

理想结果是:
- `host_init_data` 为 `0, 1, 2, ...`
- `host_data_after_run` 为 `1, 2, 3, ...`
- `device_copy_back_data` 也为 `1, 2, 3, ...`
Loading
Loading