Skip to content
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Copyright (c) PyPTO Contributors.
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
# CANN Open Software License Agreement Version 2.0 (the "License").
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.
# -----------------------------------------------------------------------------------------------------------
"""Golden script for the host_register_mapped demo."""

import atexit
import ctypes
import logging

import numpy as np
import torch

from simpler.task_interface import free_host_device_share_mem, malloc_host_device_share_mem

logger = logging.getLogger(__name__)

# Tensor names the runtime copies back from the device after the run.
__outputs__ = ["mapped_out"]

# Tolerances used when comparing device results against the golden output.
RTOL = 1e-5
ATOL = 1e-5
# Number of leading elements shown in log previews.
LOG_PREVIEW_COUNT = 16

# Buffer geometry: one ROWS x COLS float32 matrix.
ROWS = 128
COLS = 128
SIZE = ROWS * COLS

# Live handles for the shared host/device mapping, populated by
# generate_inputs() and released by _cleanup_mapped_state().  Also keeps the
# numpy/torch views alive so the underlying buffer is not collected mid-run.
_MAPPED_STATE = {}


def _log_preview(label: str, values) -> None:
flat = np.asarray(values).reshape(-1)
preview = flat[:LOG_PREVIEW_COUNT].tolist()
logger.info("%s first_%d=%s total=%d", label, min(LOG_PREVIEW_COUNT, flat.size), preview, flat.size)


def _cleanup_mapped_state() -> None:
host_ptr = _MAPPED_STATE.get("host_ptr", 0)
if not host_ptr:
_MAPPED_STATE.clear()
return

try:
free_host_device_share_mem(host_ptr)
except Exception as exc: # noqa: BLE001
logger.warning("free_host_device_share_mem cleanup failed: %s", exc)

_MAPPED_STATE.clear()


atexit.register(_cleanup_mapped_state)


def generate_inputs(params: dict) -> list:
    """Allocate and seed the shared host/device buffer; build the runtime args.

    Returns (name, value) pairs: the copy-back output tensor and the
    device-visible address of the mapped buffer as a scalar.
    """
    del params
    # Drop any mapping left over from a previous round before allocating anew.
    _cleanup_mapped_state()

    num_bytes = SIZE * ctypes.sizeof(ctypes.c_float)
    host_ptr, mapped_dev_ptr = malloc_host_device_share_mem(num_bytes)

    # View the raw host allocation as float32 and seed it with 0, 1, 2, ...
    ctypes_view = (ctypes.c_float * SIZE).from_address(host_ptr)
    host_array = np.ctypeslib.as_array(ctypes_view)
    host_array[:] = np.arange(SIZE, dtype=np.float32)
    shared_tensor = torch.from_numpy(host_array)
    _log_preview("host_register_mapped_demo: host_init_data", host_array)

    result_tensor = torch.zeros_like(shared_tensor)

    # Keep every live handle so cleanup can free the mapping and the
    # numpy/torch views stay referenced while the buffer is in use.
    _MAPPED_STATE["host_ptr"] = host_ptr
    _MAPPED_STATE["mapped_dev_ptr"] = mapped_dev_ptr
    _MAPPED_STATE["host_buf"] = ctypes_view
    _MAPPED_STATE["host_np"] = host_array
    _MAPPED_STATE["host_tensor"] = shared_tensor

    logger.info(
        "host_register_mapped_demo: host_ptr=0x%x mapped_dev_ptr=0x%x size=%d",
        host_ptr,
        mapped_dev_ptr,
        shared_tensor.numel() * shared_tensor.element_size(),
    )

    return [
        ("mapped_out", result_tensor),
        ("mapped_dev_ptr", ctypes.c_uint64(mapped_dev_ptr)),
    ]


def compute_golden(tensors: dict, params: dict) -> None:
    """Fill ``tensors["mapped_out"]`` with the expected result: host data + 1."""
    del params
    expected = _MAPPED_STATE["host_tensor"] + 1.0
    tensors["mapped_out"][:] = expected


def post_run_collect(outputs: dict, params: dict) -> None:
    """Log the post-run host buffer and copy-back output, then release state."""
    del params
    after_run = _MAPPED_STATE.get("host_np")
    if after_run is not None:
        _log_preview("host_register_mapped_demo: host_data_after_run", after_run)
    copied_back = outputs.get("mapped_out")
    if copied_back is not None:
        _log_preview("host_register_mapped_demo: device_copy_back_data", copied_back.detach().cpu().numpy())
    _cleanup_mapped_state()
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/*
* Copyright (c) PyPTO Contributors.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
* -----------------------------------------------------------------------------------------------------------
*/
/**
 * Demo kernel — both destinations receive the same computed tile, so each is
 * the ORIGINAL buffer value plus one:
 *   mapped_host_buffer[i] = original_value[i] + 1.0f
 *   out[i] = original_value[i] + 1.0f
 *
 * The input pointer comes from host_register_mapped(), so a successful result
 * shows that the kernel was able to read and write the mapped host buffer
 * directly while also producing a regular copy-back output.
 */

#include <cstdint>
#include <pto/pto-inst.hpp>

#include "tensor.h" // NOLINT(build/include_subdir)

using namespace pto; // NOLINT(build/namespaces)

#ifndef __gm__
#define __gm__
#endif

#ifndef __aicore__
#define __aicore__ [aicore] // NOLINT(whitespace/braces)
#endif

// AIV kernel entry: adds 1.0f to the mapped host buffer in place and writes
// the same result tile to the regular copy-back output.
extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
    // args[0]: descriptor of the mapped host buffer (INOUT — updated in place).
    // args[1]: descriptor of the regular copy-back output (OUT).
    __gm__ Tensor *mapped_host_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
    // Element base pointer = buffer base address + start offset (in elements).
    __gm__ float *mapped_host = reinterpret_cast<__gm__ float *>(mapped_host_tensor->buffer.addr) + mapped_host_tensor->start_offset;
    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;

    constexpr float kAddValue = 1.0f;
    // A single 128x128 float tile covers the whole demo buffer in one pass
    // (matches ROWS/COLS in the golden script).
    constexpr int kTRows_ = 128;
    constexpr int kTCols_ = 128;
    constexpr int vRows = 128;
    constexpr int vCols = 128;

    // 5-D global view with only the last two dims populated: a row-major
    // vRows x vCols plane with row stride kTCols_.
    using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>;
    using DynStridDim5 = Stride<1, 1, 1, kTCols_, 1>;
    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
    using TileData = Tile<TileType::Vec, float, kTRows_, kTCols_, BLayout::RowMajor, -1, -1>;

    // Two local tiles at fixed offsets 0x0 and 0x10000; a 128x128 float tile
    // occupies exactly 0x10000 bytes, so the tiles do not overlap.
    TileData src_tile(vRows, vCols);
    TileData dst_tile(vRows, vCols);
    TASSIGN(src_tile, 0x0);
    TASSIGN(dst_tile, 0x10000);

    GlobalData mapped_host_global(mapped_host);
    GlobalData dst_global(out);

    // Load the mapped host data, then order MTE2 (load) before V (vector math).
    TLOAD(src_tile, mapped_host_global);
    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
    // dst_tile = src_tile + 1.0f
    TADDS(dst_tile, src_tile, kAddValue);
    // Order V (compute) before MTE3 (store) so the stores see the result.
    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
    // Store the same result tile twice: back into the mapped host buffer
    // (in-place update) and into the regular output for runtime copy-back.
    TSTORE(mapped_host_global, dst_tile);
    TSTORE(dst_global, dst_tile);

    // Final MTE3 -> S handshake: scalar pipe waits until both stores retire
    // before the kernel returns.
    set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
    wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Copyright (c) PyPTO Contributors.
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
# CANN Open Software License Agreement Version 2.0 (the "License").
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.
# -----------------------------------------------------------------------------------------------------------
"""Kernel config for the host_register_mapped demo."""

from pathlib import Path

from simpler.task_interface import ArgDirection as D # pyright: ignore[reportAttributeAccessIssue]

# Directory that holds this demo's kernel and orchestration sources.
_KERNELS_ROOT = Path(__file__).parent

# AICPU orchestration entry compiled from the orchestration source below.
ORCHESTRATION = {
    "source": str(_KERNELS_ROOT / "orchestration" / "host_register_mapped_orch.cpp"),
    "function_name": "aicpu_orchestration_entry",
    # ChipStorageTaskArgs stores tensors first and scalars after them.
    "signature": [D.OUT, D.SCALAR],
}

# Device kernels the orchestration submits; func_id is the submit index used
# by pto2_rt_submit_aiv_task.
KERNELS = [
    {
        "func_id": 0,
        "source": str(_KERNELS_ROOT / "aiv" / "kernel_load_add_one.cpp"),
        "core_type": "aiv",
        # INOUT: mapped host buffer (updated in place); OUT: copy-back tensor.
        "signature": [D.INOUT, D.OUT],
    },
]

# Runtime selection and launch dimensions for the demo.
RUNTIME_CONFIG = {
    "runtime": "tensormap_and_ringbuffer",
    "aicpu_thread_num": 4,
    "block_dim": 3,
    "rounds": 1,
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* Copyright (c) PyPTO Contributors.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
* -----------------------------------------------------------------------------------------------------------
*/
/**
* Demo orchestration for host-side mapped memory.
*
* Args layout in ChipStorageTaskArgs:
* tensor(0): mapped_out (host tensor copied back by runtime)
* scalar(0): mapped_dev_ptr (device-visible address returned by host_register_mapped)
*
* The mapped host buffer is wrapped as an external tensor and submitted as
* INOUT so the kernel updates host-visible memory in place.
*/

#include <stdint.h>

#include "pto_orchestration_api.h" // NOLINT(build/include_subdir)

extern "C" {

// Report how many task arguments the runtime should hand to the orchestration
// entry: tensor(0) = mapped_out and scalar(0) = mapped_dev_ptr, i.e. two.
__attribute__((visibility("default"))) PTO2OrchestrationConfig
aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) {
    (void)orch_args;  // Unused: the arg count is fixed for this demo.
    return PTO2OrchestrationConfig{
        .expected_arg_count = 2,
    };
}

// Orchestration entry: wrap the device-visible mapped address as an external
// tensor and submit a single AIV task that updates it in place while also
// producing the regular copy-back output.
__attribute__((visibility("default"))) void aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args) {
    // tensor(0): copy-back output tensor provided by the runtime.
    const ContinuousTensor &out_arg = orch_args.tensor(0);
    Tensor mapped_out = from_tensor_arg(out_arg);

    // scalar(0): device-visible address returned by host_register_mapped.
    uint64_t mapped_input_u64 = orch_args.scalar(0);
    // Reuse the output's shape/ndims/dtype for the mapped buffer; both views
    // describe the same number of elements.
    Tensor mapped_host_buffer = make_tensor_external(
        reinterpret_cast<void *>(static_cast<uintptr_t>(mapped_input_u64)), out_arg.shapes, out_arg.ndims, out_arg.dtype
    );

    // NOTE(review): "%lx" assumes out_arg.data is a 64-bit integer and "%u"
    // assumes shapes[0] is unsigned int — confirm against ChipStorageTaskArgs.
    LOG_INFO(
        "host_register_mapped_demo: mapped_host_buffer=0x%lx mapped_out=0x%lx elements=%u", mapped_input_u64, out_arg.data,
        out_arg.shapes[0]
    );

    // Argument order must match the kernel config signature [INOUT, OUT].
    Arg params;
    params.add_inout(mapped_host_buffer);
    params.add_output(mapped_out);
    pto2_rt_submit_aiv_task(0, params);
}

} // extern "C"
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# A5 Host Register Mapped Demo

这个 demo 用来验证 `a5` 平台上的两件事:
- `mallocHostDeviceShareMem(...)` 可以在 Host 侧申请并注册一段 Device 可访问地址
- AIV kernel 可以直接读取并写回这段映射内存

## 本次 a5 修改点

- 在 `src/a5/platform/onboard/host/pto_runtime_c_api.cpp` 中实现了:
- `mallocHostDeviceShareMem(...)`
- `freeHostDeviceShareMem(...)`
- 这两个接口的执行顺序和 `a2a3` 保持一致:
- `GetDevice / SetDevice`
- `MallocHost / FreeHost`
- `HostRegister / HostUnregister`
- Python 侧继续复用通用封装:
- `malloc_host_device_share_mem(...)`
- `free_host_device_share_mem(...)`
- 新增了 `examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo` 用于硬件验证

## Demo 行为

- Host 通过 `malloc_host_device_share_mem(...)` 拿到:
- `host_ptr`
- `mapped_dev_ptr`
- Host 把 `host_ptr` 初始化为 `0, 1, 2, ...`
- orchestration 把 `mapped_dev_ptr` 包成外部 tensor
- kernel 执行(两个输出都基于原始数据计算,结果相同):
  - `mapped_host_buffer[i] = 原始值[i] + 1`
  - `mapped_out[i] = 原始值[i] + 1`
- 运行结束后打印:
- 初始 Host 数据
- 执行后 Host 内存数据
- 普通 output copy-back 数据

## 启动命令

在仓库根目录执行:

```bash
python examples/scripts/run_example.py --build \
-k examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/kernels \
-g examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/golden.py \
-p a5 -d 0
```

如果环境已经提前编好了 runtime,也可以去掉 `--build`。

## 结果判断

成功时建议重点看三组日志:
- `host_register_mapped_demo: host_init_data`
- `host_register_mapped_demo: host_data_after_run`
- `host_register_mapped_demo: device_copy_back_data`

理想结果是:
- `host_init_data` 为 `0, 1, 2, ...`
- `host_data_after_run` 为 `1, 2, 3, ...`
- `device_copy_back_data` 也为 `1, 2, 3, ...`
Loading
Loading