diff --git a/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/golden.py b/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/golden.py
new file mode 100644
index 000000000..231c3b405
--- /dev/null
+++ b/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/golden.py
@@ -0,0 +1,109 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Golden script for the host_register_mapped demo."""
+
+import atexit
+import ctypes
+import logging
+
+import numpy as np
+import torch
+
+from simpler.task_interface import free_host_device_share_mem, malloc_host_device_share_mem
+
+logger = logging.getLogger(__name__)
+
+# Names of the tensors this example treats as outputs.
+# NOTE(review): presumably read by the example runner to tell outputs from inputs - confirm.
+__outputs__ = ["mapped_out"]
+
+# NOTE(review): presumably the relative/absolute tolerances the runner applies when
+# comparing device results with the golden result - confirm against the runner.
+RTOL = 1e-5
+ATOL = 1e-5
+# Maximum number of leading elements printed by _log_preview().
+LOG_PREVIEW_COUNT = 16
+
+# Buffer geometry; the AIV kernel of this demo hard-codes the same 128 x 128 tile.
+ROWS = 128
+COLS = 128
+# Element count of the mapped buffer (and therefore of mapped_out).
+SIZE = ROWS * COLS
+
+# Bookkeeping for the currently allocated mapped buffer: host_ptr, mapped_dev_ptr and the
+# ctypes / NumPy / torch views over it. Emptied by _cleanup_mapped_state().
+_MAPPED_STATE = {}
+
+
+def _log_preview(label: str, values) -> None:
+    flat = np.asarray(values).reshape(-1)
+    preview = flat[:LOG_PREVIEW_COUNT].tolist()
+    logger.info("%s first_%d=%s total=%d", label, min(LOG_PREVIEW_COUNT, flat.size), preview, flat.size)
+
+
+def _cleanup_mapped_state() -> None:
+    host_ptr = _MAPPED_STATE.get("host_ptr", 0)
+    if not host_ptr:
+        _MAPPED_STATE.clear()
+        return
+
+    try:
+        free_host_device_share_mem(host_ptr)
+    except Exception as exc:  # noqa: BLE001
+        logger.warning("free_host_device_share_mem cleanup failed: %s", exc)
+
+    _MAPPED_STATE.clear()
+
+
+atexit.register(_cleanup_mapped_state)
+
+
+def generate_inputs(params: dict) -> list:
+    del params
+    _cleanup_mapped_state()
+
+    alloc_size = SIZE * ctypes.sizeof(ctypes.c_float)
+    host_ptr, mapped_dev_ptr = malloc_host_device_share_mem(alloc_size)
+    host_buf = (ctypes.c_float * SIZE).from_address(host_ptr)
+    host_np = np.ctypeslib.as_array(host_buf)
+    host_np[:] = np.arange(SIZE, dtype=np.float32)
+    host_tensor = torch.from_numpy(host_np)
+    _log_preview("host_register_mapped_demo: host_init_data", host_np)
+
+    mapped_out = torch.zeros_like(host_tensor)
+
+    _MAPPED_STATE.update(
+        {
+            "host_ptr": host_ptr,
+            "mapped_dev_ptr": mapped_dev_ptr,
+            "host_buf": host_buf,
+            "host_np": host_np,
+            "host_tensor": host_tensor,
+        }
+    )
+
+    logger.info(
+        "host_register_mapped_demo: host_ptr=0x%x mapped_dev_ptr=0x%x size=%d",
+        host_ptr,
+        mapped_dev_ptr,
+        host_tensor.numel() * host_tensor.element_size(),
+    )
+
+    return [
+        ("mapped_out", mapped_out),
+        ("mapped_dev_ptr", ctypes.c_uint64(mapped_dev_ptr)),
+    ]
+
+
+def compute_golden(tensors: dict, params: dict) -> None:
+    del params
+    host_tensor = _MAPPED_STATE["host_tensor"]
+    tensors["mapped_out"][:] = host_tensor + 1.0
+
+
+def post_run_collect(outputs: dict, params: dict) -> None:
+    del params
+    host_np = _MAPPED_STATE.get("host_np")
+    if host_np is not None:
+        _log_preview("host_register_mapped_demo: host_data_after_run", host_np)
+    mapped_out = outputs.get("mapped_out")
+    if mapped_out is not None:
+        _log_preview("host_register_mapped_demo: device_copy_back_data", mapped_out.detach().cpu().numpy())
+    _cleanup_mapped_state()
diff --git a/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/aiv/kernel_load_add_one.cpp b/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/aiv/kernel_load_add_one.cpp
new file mode 100644
index 000000000..7287da767
--- /dev/null
+++ b/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/aiv/kernel_load_add_one.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Demo kernel (the tile is loaded once and the same result is stored twice):
+ *   value[i]              = mapped_host_buffer[i] + 1.0f
+ *   mapped_host_buffer[i] = value[i]
+ *   out[i]                = value[i]
+ *
+ * The input pointer comes from malloc_host_device_share_mem(), so a successful
+ * result shows that the kernel was able to read and write the mapped host buffer
+ * directly while also producing a regular copy-back output.
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"  // NOLINT(build/include_subdir)
+
+using namespace pto;  // NOLINT(build/namespaces)
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+// Kernel entry point. args[0] and args[1] hold the addresses of two Tensor descriptors:
+// the mapped host buffer (submitted as INOUT) and the regular output (submitted as OUT).
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *mapped_host_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    // Data pointers: buffer address plus start_offset, applied in units of float elements.
+    __gm__ float *mapped_host = reinterpret_cast<__gm__ float *>(mapped_host_tensor->buffer.addr) + mapped_host_tensor->start_offset;
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    // The whole buffer is processed as one fixed 128 x 128 float tile.
+    constexpr float kAddValue = 1.0f;
+    constexpr int kTRows_ = 128;
+    constexpr int kTCols_ = 128;
+    constexpr int vRows = 128;
+    constexpr int vCols = 128;
+
+    // Row-major 2-D view embedded in a 5-D shape/stride description (row stride = kTCols_).
+    using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>;
+    using DynStridDim5 = Stride<1, 1, 1, kTCols_, 1>;
+    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
+    using TileData = Tile<TileType::Vec, float, kTRows_, kTCols_, BLayout::RowMajor, -1, -1>;
+
+    TileData src_tile(vRows, vCols);
+    TileData dst_tile(vRows, vCols);
+    // 0x10000 == 128 * 128 * sizeof(float), i.e. dst_tile starts right after src_tile.
+    // NOTE(review): presumably byte offsets into the on-chip vector buffer - confirm.
+    TASSIGN(src_tile, 0x0);
+    TASSIGN(dst_tile, 0x10000);
+
+    GlobalData mapped_host_global(mapped_host);
+    GlobalData dst_global(out);
+
+    // Read the tile from the mapped host buffer.
+    TLOAD(src_tile, mapped_host_global);
+    // NOTE(review): presumably makes the vector pipe wait for the MTE2 load - confirm.
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    // dst_tile = src_tile + 1.0f
+    TADDS(dst_tile, src_tile, kAddValue);
+    // NOTE(review): presumably makes the MTE3 stores wait for the vector add - confirm.
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    // Store the same result tile twice: back into the mapped host buffer and into the output.
+    TSTORE(mapped_host_global, dst_tile);
+    TSTORE(dst_global, dst_tile);
+
+    // NOTE(review): presumably waits for the stores to complete before returning - confirm.
+    set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
+    wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
+}
diff --git a/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/kernel_config.py
new file mode 100644
index 000000000..43551ae40
--- /dev/null
+++ b/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/kernel_config.py
@@ -0,0 +1,38 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Kernel config for the host_register_mapped demo."""
+
+from pathlib import Path
+
+from simpler.task_interface import ArgDirection as D  # pyright: ignore[reportAttributeAccessIssue]
+
+_KERNELS_ROOT = Path(__file__).parent
+
+# Orchestration entry built from the C++ source below. The signature lists the
+# arguments in the order produced by golden.py's generate_inputs():
+# mapped_out (output tensor) followed by mapped_dev_ptr (scalar).
+ORCHESTRATION = {
+    "source": str(_KERNELS_ROOT / "orchestration" / "host_register_mapped_orch.cpp"),
+    "function_name": "aicpu_orchestration_entry",
+    # ChipStorageTaskArgs stores tensors first and scalars after them.
+    "signature": [D.OUT, D.SCALAR],
+}
+
+# Single AIV kernel, func_id 0 (the id the orchestration passes to
+# pto2_rt_submit_aiv_task). Argument order matches the orchestration's Arg:
+# the mapped host buffer as INOUT, then the regular output as OUT.
+KERNELS = [
+    {
+        "func_id": 0,
+        "source": str(_KERNELS_ROOT / "aiv" / "kernel_load_add_one.cpp"),
+        "core_type": "aiv",
+        "signature": [D.INOUT, D.OUT],
+    },
+]
+
+# Runtime selection and launch settings for this demo.
+RUNTIME_CONFIG = {
+    "runtime": "tensormap_and_ringbuffer",
+    "aicpu_thread_num": 4,
+    "block_dim": 3,
+    "rounds": 1,
+}
diff --git a/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/orchestration/host_register_mapped_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/orchestration/host_register_mapped_orch.cpp
new file mode 100644
index 000000000..f9c9a87ff
--- /dev/null
+++ b/examples/a2a3/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/orchestration/host_register_mapped_orch.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Demo orchestration for host-side mapped memory.
+ *
+ * Args layout in ChipStorageTaskArgs:
+ *   tensor(0): mapped_out (host tensor copied back by runtime)
+ *   scalar(0): mapped_dev_ptr (device-visible address returned by malloc_host_device_share_mem)
+ *
+ * The mapped host buffer is wrapped as an external tensor and submitted as
+ * INOUT so the kernel updates host-visible memory in place.
+ */
+
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
+
+extern "C" {
+
+// Reports the orchestration configuration. expected_arg_count = 2 corresponds to the two
+// arguments this orchestration reads: tensor(0) (mapped_out) and scalar(0) (mapped_dev_ptr).
+__attribute__((visibility("default"))) PTO2OrchestrationConfig
+aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) {
+    // The configuration is constant, so the arguments are intentionally unused.
+    (void)orch_args;
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 2,
+    };
+}
+
+// Orchestration entry: wraps the mapped device-visible address as an external tensor that
+// reuses the shape and dtype of the output argument, then submits AIV kernel 0 with the
+// mapped buffer as INOUT and the regular output as OUT.
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args) {
+    const ContinuousTensor &out_desc = orch_args.tensor(0);
+    Tensor out_tensor = from_tensor_arg(out_desc);
+
+    // scalar(0) carries the device-visible address of the mapped host buffer.
+    uint64_t mapped_dev_addr = orch_args.scalar(0);
+    void *mapped_dev_ptr = reinterpret_cast<void *>(static_cast<uintptr_t>(mapped_dev_addr));
+    Tensor mapped_tensor = make_tensor_external(mapped_dev_ptr, out_desc.shapes, out_desc.ndims, out_desc.dtype);
+
+    LOG_INFO(
+        "host_register_mapped_demo: mapped_host_buffer=0x%lx mapped_out=0x%lx elements=%u", mapped_dev_addr,
+        out_desc.data, out_desc.shapes[0]
+    );
+
+    // Argument order must match the kernel signature: INOUT first, then OUT.
+    Arg task_args;
+    task_args.add_inout(mapped_tensor);
+    task_args.add_output(out_tensor);
+    pto2_rt_submit_aiv_task(0, task_args);
+}
+
+}  // extern "C"
diff --git a/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/README.md b/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/README.md
new file mode 100644
index 000000000..c05c73fa9
--- /dev/null
+++ b/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/README.md
@@ -0,0 +1,59 @@
+# A5 Host Register Mapped Demo
+
+这个 demo 用来验证 `a5` 平台上的两件事：
+- `mallocHostDeviceShareMem(...)` 可以在 Host 侧申请并注册一段 Device 可访问地址
+- AIV kernel 可以直接读取并写回这段映射内存
+
+## 本次 a5 修改点
+
+- 在 `src/a5/platform/onboard/host/pto_runtime_c_api.cpp` 中实现了：
+  - `mallocHostDeviceShareMem(...)`
+  - `freeHostDeviceShareMem(...)`
+- 这两个接口的执行顺序和 `a2a3` 保持一致：
+  - `GetDevice / SetDevice`
+  - `MallocHost / FreeHost`
+  - `HostRegister / HostUnregister`
+- Python 侧继续复用通用封装：
+  - `malloc_host_device_share_mem(...)`
+  - `free_host_device_share_mem(...)`
+- 新增了 `examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo` 用于硬件验证
+
+## Demo 行为
+
+- Host 通过 `malloc_host_device_share_mem(...)` 拿到：
+  - `host_ptr`
+  - `mapped_dev_ptr`
+- Host 把 `host_ptr` 初始化为 `0, 1, 2, ...`
+- orchestration 把 `mapped_dev_ptr` 包成外部 tensor
+- kernel 先读取 `mapped_host_buffer`，计算一次 `value[i] = mapped_host_buffer[i] + 1`，再把同一个结果写到两处：
+  - `mapped_host_buffer[i] = value[i]`
+  - `mapped_out[i] = value[i]`
+- 运行结束后打印：
+  - 初始 Host 数据
+  - 执行后 Host 内存数据
+  - 普通 output copy-back 数据
+
+## 启动命令
+
+在仓库根目录执行：
+
+```bash
+python examples/scripts/run_example.py --build \
+  -k examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/kernels \
+  -g examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/golden.py \
+  -p a5 -d 0
+```
+
+如果环境已经提前编好了 runtime，也可以去掉 `--build`。
+
+## 结果判断
+
+成功时建议重点看三组日志：
+- `a5_host_register_mapped_demo: host_init_data`
+- `a5_host_register_mapped_demo: host_data_after_run`
+- `a5_host_register_mapped_demo: device_copy_back_data`
+
+理想结果是：
+- `host_init_data` 为 `0, 1, 2, ...`
+- `host_data_after_run` 为 `1, 2, 3, ...`
+- `device_copy_back_data` 也为 `1, 2, 3, ...`
diff --git a/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/golden.py b/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/golden.py
new file mode 100644
index 000000000..21ad1fbec
--- /dev/null
+++ b/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/golden.py
@@ -0,0 +1,109 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Golden script for the a5 host_register_mapped demo."""
+
+import atexit
+import ctypes
+import logging
+
+import numpy as np
+import torch
+
+from simpler.task_interface import free_host_device_share_mem, malloc_host_device_share_mem
+
+logger = logging.getLogger(__name__)
+
+# Names of the tensors this example treats as outputs.
+# NOTE(review): presumably read by the example runner to tell outputs from inputs - confirm.
+__outputs__ = ["mapped_out"]
+
+# NOTE(review): presumably the relative/absolute tolerances the runner applies when
+# comparing device results with the golden result - confirm against the runner.
+RTOL = 1e-5
+ATOL = 1e-5
+# Maximum number of leading elements printed by _log_preview().
+LOG_PREVIEW_COUNT = 16
+
+# Buffer geometry; the AIV kernel of this demo hard-codes the same 128 x 128 tile.
+ROWS = 128
+COLS = 128
+# Element count of the mapped buffer (and therefore of mapped_out).
+SIZE = ROWS * COLS
+
+# Bookkeeping for the currently allocated mapped buffer: host_ptr, mapped_dev_ptr and the
+# ctypes / NumPy / torch views over it. Emptied by _cleanup_mapped_state().
+_MAPPED_STATE = {}
+
+
+def _log_preview(label: str, values) -> None:
+    flat = np.asarray(values).reshape(-1)
+    preview = flat[:LOG_PREVIEW_COUNT].tolist()
+    logger.info("%s first_%d=%s total=%d", label, min(LOG_PREVIEW_COUNT, flat.size), preview, flat.size)
+
+
+def _cleanup_mapped_state() -> None:
+    host_ptr = _MAPPED_STATE.get("host_ptr", 0)
+    if not host_ptr:
+        _MAPPED_STATE.clear()
+        return
+
+    try:
+        free_host_device_share_mem(host_ptr)
+    except Exception as exc:  # noqa: BLE001
+        logger.warning("free_host_device_share_mem cleanup failed: %s", exc)
+
+    _MAPPED_STATE.clear()
+
+
+atexit.register(_cleanup_mapped_state)
+
+
+def generate_inputs(params: dict) -> list:
+    del params
+    _cleanup_mapped_state()
+
+    alloc_size = SIZE * ctypes.sizeof(ctypes.c_float)
+    host_ptr, mapped_dev_ptr = malloc_host_device_share_mem(alloc_size)
+    host_buf = (ctypes.c_float * SIZE).from_address(host_ptr)
+    host_np = np.ctypeslib.as_array(host_buf)
+    host_np[:] = np.arange(SIZE, dtype=np.float32)
+    host_tensor = torch.from_numpy(host_np)
+    _log_preview("a5_host_register_mapped_demo: host_init_data", host_np)
+
+    mapped_out = torch.zeros_like(host_tensor)
+
+    _MAPPED_STATE.update(
+        {
+            "host_ptr": host_ptr,
+            "mapped_dev_ptr": mapped_dev_ptr,
+            "host_buf": host_buf,
+            "host_np": host_np,
+            "host_tensor": host_tensor,
+        }
+    )
+
+    logger.info(
+        "a5_host_register_mapped_demo: host_ptr=0x%x mapped_dev_ptr=0x%x size=%d",
+        host_ptr,
+        mapped_dev_ptr,
+        host_tensor.numel() * host_tensor.element_size(),
+    )
+
+    return [
+        ("mapped_out", mapped_out),
+        ("mapped_dev_ptr", ctypes.c_uint64(mapped_dev_ptr)),
+    ]
+
+
+def compute_golden(tensors: dict, params: dict) -> None:
+    del params
+    host_tensor = _MAPPED_STATE["host_tensor"]
+    tensors["mapped_out"][:] = host_tensor + 1.0
+
+
+def post_run_collect(outputs: dict, params: dict) -> None:
+    del params
+    host_np = _MAPPED_STATE.get("host_np")
+    if host_np is not None:
+        _log_preview("a5_host_register_mapped_demo: host_data_after_run", host_np)
+    mapped_out = outputs.get("mapped_out")
+    if mapped_out is not None:
+        _log_preview("a5_host_register_mapped_demo: device_copy_back_data", mapped_out.detach().cpu().numpy())
+    _cleanup_mapped_state()
diff --git a/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/aiv/kernel_load_add_one.cpp b/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/aiv/kernel_load_add_one.cpp
new file mode 100644
index 000000000..79e41e6d8
--- /dev/null
+++ b/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/aiv/kernel_load_add_one.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Demo kernel (the tile is loaded once and the same result is stored twice):
+ *   value[i]              = mapped_host_buffer[i] + 1.0f
+ *   mapped_host_buffer[i] = value[i]
+ *   out[i]                = value[i]
+ *
+ * The input pointer comes from mallocHostDeviceShareMem(), so a successful result
+ * shows that the kernel was able to read and write the mapped host buffer
+ * directly while also producing a regular copy-back output.
+ */
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"  // NOLINT(build/include_subdir)
+
+using namespace pto;  // NOLINT(build/namespaces)
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#endif
+
+// Kernel entry point. args[0] and args[1] hold the addresses of two Tensor descriptors:
+// the mapped host buffer (submitted as INOUT) and the regular output (submitted as OUT).
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    __gm__ Tensor *mapped_host_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    // Data pointers: buffer address plus start_offset, applied in units of float elements.
+    __gm__ float *mapped_host = reinterpret_cast<__gm__ float *>(mapped_host_tensor->buffer.addr) + mapped_host_tensor->start_offset;
+    __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset;
+
+    // The whole buffer is processed as one fixed 128 x 128 float tile.
+    constexpr float kAddValue = 1.0f;
+    constexpr int kTRows_ = 128;
+    constexpr int kTCols_ = 128;
+    constexpr int vRows = 128;
+    constexpr int vCols = 128;
+
+    // Row-major 2-D view embedded in a 5-D shape/stride description (row stride = kTCols_).
+    using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>;
+    using DynStridDim5 = Stride<1, 1, 1, kTCols_, 1>;
+    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
+    using TileData = Tile<TileType::Vec, float, kTRows_, kTCols_, BLayout::RowMajor, -1, -1>;
+
+    TileData src_tile(vRows, vCols);
+    TileData dst_tile(vRows, vCols);
+    // 0x10000 == 128 * 128 * sizeof(float), i.e. dst_tile starts right after src_tile.
+    // NOTE(review): presumably byte offsets into the on-chip vector buffer - confirm.
+    TASSIGN(src_tile, 0x0);
+    TASSIGN(dst_tile, 0x10000);
+
+    GlobalData mapped_host_global(mapped_host);
+    GlobalData dst_global(out);
+
+    // Read the tile from the mapped host buffer.
+    TLOAD(src_tile, mapped_host_global);
+    // NOTE(review): presumably makes the vector pipe wait for the MTE2 load - confirm.
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+    // dst_tile = src_tile + 1.0f
+    TADDS(dst_tile, src_tile, kAddValue);
+    // NOTE(review): presumably makes the MTE3 stores wait for the vector add - confirm.
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    // Store the same result tile twice: back into the mapped host buffer and into the output.
+    TSTORE(mapped_host_global, dst_tile);
+    TSTORE(dst_global, dst_tile);
+
+    // NOTE(review): presumably waits for the stores to complete before returning - confirm.
+    set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
+    wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
+}
diff --git a/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/kernel_config.py
new file mode 100644
index 000000000..d50929064
--- /dev/null
+++ b/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/kernel_config.py
@@ -0,0 +1,37 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Kernel config for the a5 host_register_mapped demo."""
+
+from pathlib import Path
+
+from simpler.task_interface import ArgDirection as D  # pyright: ignore[reportAttributeAccessIssue]
+
+_KERNELS_ROOT = Path(__file__).parent
+
+# Orchestration entry built from the C++ source below. The signature lists the
+# arguments in the order produced by golden.py's generate_inputs():
+# mapped_out (output tensor) followed by mapped_dev_ptr (scalar).
+ORCHESTRATION = {
+    "source": str(_KERNELS_ROOT / "orchestration" / "host_register_mapped_orch.cpp"),
+    "function_name": "aicpu_orchestration_entry",
+    "signature": [D.OUT, D.SCALAR],
+}
+
+# Single AIV kernel, func_id 0 (the id the orchestration passes to
+# pto2_rt_submit_aiv_task). Argument order matches the orchestration's Arg:
+# the mapped host buffer as INOUT, then the regular output as OUT.
+KERNELS = [
+    {
+        "func_id": 0,
+        "source": str(_KERNELS_ROOT / "aiv" / "kernel_load_add_one.cpp"),
+        "core_type": "aiv",
+        "signature": [D.INOUT, D.OUT],
+    },
+]
+
+# Runtime selection and launch settings for this demo.
+RUNTIME_CONFIG = {
+    "runtime": "tensormap_and_ringbuffer",
+    "aicpu_thread_num": 4,
+    "block_dim": 3,
+    "rounds": 1,
+}
diff --git a/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/orchestration/host_register_mapped_orch.cpp b/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/orchestration/host_register_mapped_orch.cpp
new file mode 100644
index 000000000..7fe235e45
--- /dev/null
+++ b/examples/a5/tensormap_and_ringbuffer/host_register_mapped_demo/kernels/orchestration/host_register_mapped_orch.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Demo orchestration for host-side mapped memory on a5.
+ *
+ * Args layout in ChipStorageTaskArgs:
+ *   tensor(0): mapped_out (host tensor copied back by runtime)
+ *   scalar(0): mapped_dev_ptr (device-visible address returned by mallocHostDeviceShareMem)
+ */
+
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
+
+extern "C" {
+
+// Reports the orchestration configuration. expected_arg_count = 2 corresponds to the two
+// arguments this orchestration reads: tensor(0) (mapped_out) and scalar(0) (mapped_dev_ptr).
+__attribute__((visibility("default"))) PTO2OrchestrationConfig
+aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) {
+    // The configuration is constant, so the arguments are intentionally unused.
+    (void)orch_args;
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 2,
+    };
+}
+
+// Orchestration entry: wraps the mapped device-visible address as an external tensor that
+// reuses the shape and dtype of the output argument, then submits AIV kernel 0 with the
+// mapped buffer as INOUT and the regular output as OUT.
+__attribute__((visibility("default"))) void aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args) {
+    const ContinuousTensor &out_desc = orch_args.tensor(0);
+    Tensor out_tensor = from_tensor_arg(out_desc);
+
+    // scalar(0) carries the device-visible address of the mapped host buffer.
+    uint64_t mapped_dev_addr = orch_args.scalar(0);
+    void *mapped_dev_ptr = reinterpret_cast<void *>(static_cast<uintptr_t>(mapped_dev_addr));
+    Tensor mapped_tensor = make_tensor_external(mapped_dev_ptr, out_desc.shapes, out_desc.ndims, out_desc.dtype);
+
+    LOG_INFO(
+        "a5_host_register_mapped_demo: mapped_host_buffer=0x%lx mapped_out=0x%lx elements=%u",
+        mapped_dev_addr, out_desc.data, out_desc.shapes[0]
+    );
+
+    // Argument order must match the kernel signature: INOUT first, then OUT.
+    Arg task_args;
+    task_args.add_inout(mapped_tensor);
+    task_args.add_output(out_tensor);
+    pto2_rt_submit_aiv_task(0, task_args);
+}
+
+}  // extern "C"
diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp
index d949735c3..5df201593 100644
--- a/python/bindings/task_interface.cpp
+++ b/python/bindings/task_interface.cpp
@@ -560,6 +560,23 @@ NB_MODULE(_task_interface, m) {
         .def("set_device", &ChipWorker::set_device, nb::arg("device_id"))
         .def("reset_device", &ChipWorker::reset_device)
         .def("finalize", &ChipWorker::finalize)
+        // Python: malloc_host_device_share_mem(size, device_id=-1) -> (host_ptr, dev_ptr).
+        // Both addresses are returned as Python integers.
+        .def(
+            "malloc_host_device_share_mem",
+            [](ChipWorker &self, uint64_t size, int device_id) -> nb::tuple {
+                uint64_t host_ptr = 0;
+                uint64_t dev_ptr = 0;
+                self.mallocHostDeviceShareMem(size, &host_ptr, &dev_ptr, device_id);
+                // Build the 2-tuple directly instead of going through a temporary list.
+                return nb::make_tuple(host_ptr, dev_ptr);
+            },
+            nb::arg("size"), nb::arg("device_id") = -1
+        )
+        // Python: free_host_device_share_mem(host_ptr, device_id=-1); forwards directly to
+        // ChipWorker::freeHostDeviceShareMem for a buffer identified by its host pointer.
+        .def(
+            "free_host_device_share_mem", &ChipWorker::freeHostDeviceShareMem, nb::arg("host_ptr"),
+            nb::arg("device_id") = -1
+        )
         .def(
             "run",
             [](ChipWorker &self, const PyChipCallable &callable, ChipStorageTaskArgs &args,
diff --git a/python/simpler/task_interface.py b/python/simpler/task_interface.py
index d84fa8352..c23e5a1b6 100644
--- a/python/simpler/task_interface.py
+++ b/python/simpler/task_interface.py
@@ -57,6 +57,9 @@
     "torch_dtype_to_datatype",
     "make_tensor_arg",
     "scalar_to_uint64",
+    "get_active_worker",
+    "malloc_host_device_share_mem",
+    "free_host_device_share_mem",
     # Distributed runtime
     "WorkerType",
     "TaskState",
@@ -69,6 +72,7 @@
 
 # Lazy-loaded torch dtype → DataType map (avoids importing torch at module load)
 _TORCH_DTYPE_MAP = None
+# Most recently constructed ChipWorker; reset to None when that same worker is finalized.
+_ACTIVE_WORKER = None
 
 
 def _ensure_torch_map():
@@ -156,7 +160,9 @@ class ChipWorker:
     """
 
     def __init__(self):
+        # Every new instance replaces the module-level active worker that the
+        # module-level malloc/free helpers resolve through get_active_worker().
+        global _ACTIVE_WORKER
         self._impl = _ChipWorker()
+        _ACTIVE_WORKER = self
 
     def init(self, host_path, aicpu_path, aicore_path, sim_context_lib_path=""):
         """Load host runtime library and cache platform binaries.
@@ -185,12 +191,28 @@ def reset_device(self):
         """Release device resources. The runtime binding remains intact."""
         self._impl.reset_device()
 
+    def malloc_host_device_share_mem(self, size, device_id=None):
+        """Allocate host memory and register it as a device-visible mapped buffer."""
+        if device_id is None:
+            device_id = self.device_id
+        host_ptr, dev_ptr = self._impl.malloc_host_device_share_mem(int(size), int(device_id))
+        return int(host_ptr), int(dev_ptr)
+
+    def free_host_device_share_mem(self, host_ptr, device_id=None):
+        """Unregister and free a mapped host buffer."""
+        if device_id is None:
+            device_id = self.device_id
+        self._impl.free_host_device_share_mem(int(host_ptr), int(device_id))
+
     def finalize(self):
         """Tear down everything: device resources and runtime library.
 
         Terminal operation — the object cannot be reused after this.
         """
+        global _ACTIVE_WORKER
         self._impl.finalize()
+        # Clear the module-level reference only if it still points at this worker, so
+        # finalizing an older worker does not unregister a newer one.
+        if _ACTIVE_WORKER is self:
+            _ACTIVE_WORKER = None
 
     def run(self, callable, args, config=None, **kwargs):
         """Execute a callable synchronously.
@@ -218,3 +240,24 @@ def initialized(self):
     @property
     def device_set(self):
         return self._impl.device_set
+
+
+def get_active_worker():
+    """Return the most recently created ChipWorker in this process."""
+    if _ACTIVE_WORKER is None:
+        raise RuntimeError("No active ChipWorker is available")
+    if not _ACTIVE_WORKER.initialized:
+        raise RuntimeError("The active ChipWorker is not initialized")
+    if not _ACTIVE_WORKER.device_set:
+        raise RuntimeError("The active ChipWorker does not have a device set")
+    return _ACTIVE_WORKER
+
+
+def malloc_host_device_share_mem(size, device_id=None):
+    """Allocate host memory and register it as a device-visible mapped buffer."""
+    return get_active_worker().malloc_host_device_share_mem(size, device_id=device_id)
+
+
+def free_host_device_share_mem(host_ptr, device_id=None):
+    """Unregister and free a mapped host buffer."""
+    get_active_worker().free_host_device_share_mem(host_ptr, device_id=device_id)
diff --git a/simpler_setup/code_runner.py b/simpler_setup/code_runner.py
index 9f51f2d10..0790fbdb1 100644
--- a/simpler_setup/code_runner.py
+++ b/simpler_setup/code_runner.py
@@ -542,81 +542,103 @@ def _compile_one_kernel(kernel):
         )
         worker.set_device(self.device_id)
 
-        # Step 3: Run each parameter set
-        total_cases = len(self.params_list)
-        for case_idx, params in enumerate(self.params_list):
-            logger.info("=" * 60)
-            logger.info(f"=== Case {case_idx + 1}/{total_cases}: {params} ===")
-            logger.info("=" * 60)
-
-            # Generate tensors using golden.py
-            logger.info("=== Generating Inputs ===")
-            result = self._golden_module.generate_inputs(params)
-
-            if isinstance(result, list):
-                # New-style: generate_inputs returns flat argument list
-                orch_args, args, inputs, outputs = self._build_func_args_from_list(result)
-                tensors = args  # args contains all named items; compute_golden receives all
-            else:
-                # Legacy: generate_inputs returns dict of tensors
-                tensors = {k: _to_torch(v) for k, v in result.items()}
-                orch_args = self._build_func_args(tensors)
-                inputs, outputs = self._identify_outputs(tensors)
-
-            logger.info(f"Inputs: {list(inputs.keys())}")
-            logger.info(f"Outputs: {list(outputs.keys())}")
-
-            # Determine actual tensor order for debugging
-            logger.debug(f"Tensor order: {list(tensors.keys())}")
-            logger.debug(f"orch_args count: {len(orch_args)}")
-
-            # Build environment for runtime initialization
-            run_env = _kernel_config_runtime_env(self._kernel_config, self.kernels_dir)
-            if run_env:
-                logger.debug(f"Runtime init env overrides: {run_env}")
-
-            # Golden
-            if not self.skip_golden:
-                golden = {k: v.clone() for k, v in outputs.items()}
-                golden_with_inputs = {**inputs, **golden}
-                _t_golden_start = time.perf_counter()
-                self._golden_module.compute_golden(golden_with_inputs, params)
-                _t_golden_end = time.perf_counter()
-                logger.info(f">>> compute_golden() took {_t_golden_end - _t_golden_start:.3f}s")
-
-            initial_outputs = {k: v.clone() for k, v in outputs.items()}
-
-            for round_idx in range(self.repeat_rounds):
-                if self.repeat_rounds > 1:
-                    logger.info(f"--- Round {round_idx + 1}/{self.repeat_rounds} ---")
-
-                for k, v in initial_outputs.items():
-                    outputs[k].copy_(v)
-
-                config = ChipCallConfig()
-                config.block_dim = self.block_dim
-                config.aicpu_thread_num = self.aicpu_thread_num
-                if self.enable_profiling and round_idx == 0:
-                    config.enable_profiling = True
-                    logger.info("Profiling enabled")
-                if self.enable_dump_tensor:
-                    config.enable_dump_tensor = True
-                    logger.info("Dump tensor enabled")
-
-                with _temporary_env(run_env):
-                    worker.run(chip_callable, orch_args, config)
-
-                if not self.skip_golden:
-                    self._compare_with_golden(outputs, golden)
-
-            logger.info(f"=== Case {case_idx + 1}/{total_cases} Passed ===")
-
-        worker.reset_device()
-        worker.finalize()
+        try:
+            # Step 3: Run each parameter set
+            total_cases = len(self.params_list)
+            for case_idx, params in enumerate(self.params_list):
+                logger.info("=" * 60)
+                logger.info(f"=== Case {case_idx + 1}/{total_cases}: {params} ===")
+                logger.info("=" * 60)
+
+                outputs = {}
+                try:
+                    # Generate tensors using golden.py
+                    logger.info("=== Generating Inputs ===")
+                    result = self._golden_module.generate_inputs(params)
+
+                    if isinstance(result, list):
+                        # New-style: generate_inputs returns flat argument list
+                        orch_args, args, inputs, outputs = self._build_func_args_from_list(result)
+                        tensors = args  # args contains all named items; compute_golden receives all
+                    else:
+                        # Legacy: generate_inputs returns dict of tensors
+                        tensors = {k: _to_torch(v) for k, v in result.items()}
+                        orch_args = self._build_func_args(tensors)
+                        inputs, outputs = self._identify_outputs(tensors)
+
+                    logger.info(f"Inputs: {list(inputs.keys())}")
+                    logger.info(f"Outputs: {list(outputs.keys())}")
+
+                    # Determine actual tensor order for debugging
+                    logger.debug(f"Tensor order: {list(tensors.keys())}")
+                    logger.debug(f"orch_args count: {len(orch_args)}")
+
+                    # Build environment for runtime initialization
+                    run_env = _kernel_config_runtime_env(self._kernel_config, self.kernels_dir)
+                    if run_env:
+                        logger.debug(f"Runtime init env overrides: {run_env}")
+
+                    # Golden
+                    if not self.skip_golden:
+                        golden = {k: v.clone() for k, v in outputs.items()}
+                        golden_with_inputs = {**inputs, **golden}
+                        _t_golden_start = time.perf_counter()
+                        self._golden_module.compute_golden(golden_with_inputs, params)
+                        _t_golden_end = time.perf_counter()
+                        logger.info(f">>> compute_golden() took {_t_golden_end - _t_golden_start:.3f}s")
+
+                    initial_outputs = {k: v.clone() for k, v in outputs.items()}
+
+                    for round_idx in range(self.repeat_rounds):
+                        if self.repeat_rounds > 1:
+                            logger.info(f"--- Round {round_idx + 1}/{self.repeat_rounds} ---")
+
+                        for k, v in initial_outputs.items():
+                            outputs[k].copy_(v)
+
+                        config = ChipCallConfig()
+                        config.block_dim = self.block_dim
+                        config.aicpu_thread_num = self.aicpu_thread_num
+                        if self.enable_profiling and round_idx == 0:
+                            config.enable_profiling = True
+                            logger.info("Profiling enabled")
+                        if self.enable_dump_tensor:
+                            config.enable_dump_tensor = True
+                            logger.info("Dump tensor enabled")
+
+                        with _temporary_env(run_env):
+                            worker.run(chip_callable, orch_args, config)
+
+                        if not self.skip_golden:
+                            self._compare_with_golden(outputs, golden)
+
+                    logger.info(f"=== Case {case_idx + 1}/{total_cases} Passed ===")
+                finally:
+                    self._run_post_run_collect(outputs, params)
+        finally:
+            worker.reset_device()
+            worker.finalize()
+
         logger.info("=" * 60)
         logger.info(f"=== All {total_cases} cases passed ===")
         logger.info("=" * 60)
 
+    def _run_post_run_collect(
+        self,
+        outputs: dict[str, torch.Tensor],
+        params: dict[str, Any],
+    ) -> None:
+        """
+        Call the optional ``post_run_collect(outputs, params)`` hook of golden.py.
+
+        The hook lets golden.py release external state once a case is over. A missing or
+        non-callable attribute is skipped; exceptions raised by the hook are not caught here.
+        """
+        collect_fn = getattr(self._golden_module, "post_run_collect", None)
+        if not callable(collect_fn):
+            return
+        collect_fn(outputs, params)
+
     def _compare_with_golden(
         self,
         outputs: dict[str, torch.Tensor],
diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
index 2808152eb..c12a663d8 100644
--- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
+++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
@@ -20,6 +20,9 @@
 #include "callable.h"
 #include "task_args.h"
 
+#include <inttypes.h>
+#include <dlfcn.h>
+#include <initializer_list>
 #include <pthread.h>
 #include <vector>
 
@@ -28,6 +31,88 @@
 #include "host/raii_scope_guard.h"
 #include "runtime.h"
 
+namespace {
+
+using RtMallocHostFn = int (*)(void **, uint64_t, uint32_t);
+using RtFreeHostFn = int (*)(void *);
+using RtHostRegisterFn = int (*)(void *, uint64_t, uint32_t, void **);
+using RtHostUnregisterFn = int (*)(void *);
+using AclMallocHostFn = int (*)(void **, size_t);
+using AclFreeHostFn = int (*)(void *);
+using AclHostRegisterFn = int (*)(void *, uint64_t, uint32_t, void **);
+using AclHostUnregisterFn = int (*)(void *);
+using GetDeviceFn = int (*)(int32_t *);
+using DirectSetDeviceFn = int (*)(int32_t);
+
+template <typename Fn>  // Fn: function-pointer type the resolved address is cast to
+Fn resolve_symbol(const char **resolved_name, std::initializer_list<const char *> names) {
+    for (const char *name : names) {  // candidates are tried in order; the first hit wins
+        dlerror();  // clear any stale error so the check below reflects this lookup
+        void *sym = dlsym(RTLD_DEFAULT, name);
+        const char *err = dlerror();
+        if (err == nullptr && sym != nullptr) {
+            if (resolved_name != nullptr) {  // resolved_name is optional
+                *resolved_name = name;  // report which candidate matched
+            }
+            return reinterpret_cast<Fn>(sym);
+        }
+    }
+    if (resolved_name != nullptr) {
+        *resolved_name = nullptr;
+    }
+    return nullptr;  // none of the candidate names resolved
+}
+
+static constexpr uint32_t kHostRegisterMappedFlag =  // flag argument passed to the host-register call
+#if defined(RT_HOST_REGISTER_MAPPED)
+    RT_HOST_REGISTER_MAPPED;
+#elif defined(ACL_HOST_REGISTER_MAPPED)
+    ACL_HOST_REGISTER_MAPPED;
+#else
+    0U;  // neither macro is defined here; NOTE(review): confirm 0 is the intended register flag
+#endif
+
+int ensure_current_device_for_share_mem(uint32_t device_id) {  // 0 on success, non-zero on failure
+    const char *symbol_name = nullptr;
+    if (GetDeviceFn get_device_fn = resolve_symbol<GetDeviceFn>(&symbol_name, {"aclrtGetDevice", "rtGetDevice"})) {
+        int32_t current_device = -1;
+        int rc = get_device_fn(&current_device);
+        if (rc != 0) {  // query failed: try to set the requested device instead
+            LOG_INFO(
+                "ensure_current_device_for_share_mem: %s failed rc=%d, trying to set device to %u",
+                symbol_name, rc, device_id
+            );
+            if (DirectSetDeviceFn set_device_fn =
+                    resolve_symbol<DirectSetDeviceFn>(&symbol_name, {"rtSetDevice", "aclrtSetDevice"})) {
+                rc = set_device_fn(static_cast<int32_t>(device_id));
+                if (rc != 0) {
+                    LOG_ERROR(
+                        "ensure_current_device_for_share_mem: %s(%u) failed: rc=%d", symbol_name, device_id, rc
+                    );
+                    return rc;
+                }
+                return 0;
+            }
+            LOG_ERROR("ensure_current_device_for_share_mem: missing symbols rtSetDevice / aclrtSetDevice");
+            return rc;  // still the error code from the failed get-device call
+        }
+
+        if (current_device != static_cast<int32_t>(device_id)) {  // mismatch is an error; no device switch
+            LOG_ERROR(
+                "ensure_current_device_for_share_mem: current device %d does not match requested device %u",
+                static_cast<int>(current_device), device_id
+            );
+            return -1;
+        }
+        return 0;
+    }
+
+    LOG_ERROR("ensure_current_device_for_share_mem: missing symbols aclrtGetDevice / rtGetDevice");
+    return -1;
+}
+
+}  // namespace
+
 extern "C" {
 
 /* ===========================================================================
@@ -114,8 +199,141 @@ void destroy_device_context(DeviceContextHandle ctx) { delete static_cast<Device
 size_t get_runtime_size(void) { return sizeof(Runtime); }
 
 int set_device(DeviceContextHandle ctx, int device_id) {
-    (void)ctx;
-    (void)device_id;
+    if (ctx == NULL) return -1;  // no device context to attach through
+    try {
+        return static_cast<DeviceRunner *>(ctx)->attach_current_thread(device_id);
+    } catch (...) {  // keep C++ exceptions from crossing the extern "C" boundary
+        return -1;
+    }
+}
+
+int mallocHostDeviceShareMem(uint32_t deviceId, uint64_t size, void **hostPtr, void **devPtr) {
+    if (hostPtr == NULL || devPtr == NULL || size == 0) {  // reject null outputs and zero-size requests
+        return -1;
+    }
+
+    *hostPtr = nullptr;  // assigned again only after registration succeeds
+    *devPtr = nullptr;
+
+    int rc = ensure_current_device_for_share_mem(deviceId);  // current device must be deviceId
+    if (rc != 0) {
+        return rc;
+    }
+
+    void *allocated_host_ptr = nullptr;
+    const char *symbol_name = nullptr;
+    if (RtMallocHostFn malloc_fn = resolve_symbol<RtMallocHostFn>(&symbol_name, {"rtMallocHost"})) {
+        rc = malloc_fn(&allocated_host_ptr, size, 0U);  // NOTE(review): third argument is 0; confirm its meaning
+        if (rc != 0 || allocated_host_ptr == nullptr) {
+            LOG_ERROR("mallocHostDeviceShareMem via %s failed on rtMallocHost: rc=%d size=%" PRIu64, symbol_name, rc, size);
+            return (rc != 0) ? rc : -1;  // rc == 0 with a null pointer is still a failure
+        }
+    } else if (AclMallocHostFn malloc_fn = resolve_symbol<AclMallocHostFn>(&symbol_name, {"aclrtMallocHost"})) {
+        rc = malloc_fn(&allocated_host_ptr, static_cast<size_t>(size));
+        if (rc != 0 || allocated_host_ptr == nullptr) {
+            LOG_ERROR("mallocHostDeviceShareMem via %s failed on aclrtMallocHost: rc=%d size=%" PRIu64, symbol_name, rc, size);
+            return (rc != 0) ? rc : -1;
+        }
+    } else {
+        LOG_ERROR("mallocHostDeviceShareMem: missing symbols rtMallocHost / aclrtMallocHost");
+        return -1;
+    }
+
+    if (RtHostRegisterFn register_fn =
+            resolve_symbol<RtHostRegisterFn>(&symbol_name, {"rtsHostRegister", "rtHostRegister"})) {
+        rc = register_fn(allocated_host_ptr, size, kHostRegisterMappedFlag, devPtr);
+        if (rc != 0 || *devPtr == nullptr) {  // registration must yield a device pointer
+            LOG_ERROR(
+                "mallocHostDeviceShareMem via %s failed on host register: rc=%d host=%p size=%" PRIu64 " flag=%u",
+                symbol_name, rc, allocated_host_ptr, size, kHostRegisterMappedFlag
+            );
+            if (RtFreeHostFn free_fn = resolve_symbol<RtFreeHostFn>(nullptr, {"rtFreeHost"})) {
+                free_fn(allocated_host_ptr);  // best-effort release; its return code is ignored
+            } else if (AclFreeHostFn free_fn = resolve_symbol<AclFreeHostFn>(nullptr, {"aclrtFreeHost"})) {
+                free_fn(allocated_host_ptr);
+            }
+            return (rc != 0) ? rc : -1;
+        }
+    } else if (AclHostRegisterFn register_fn =
+                   resolve_symbol<AclHostRegisterFn>(&symbol_name, {"aclrtHostRegister"})) {
+        rc = register_fn(allocated_host_ptr, size, kHostRegisterMappedFlag, devPtr);
+        if (rc != 0 || *devPtr == nullptr) {  // registration must yield a device pointer
+            LOG_ERROR(
+                "mallocHostDeviceShareMem via %s failed on host register: rc=%d host=%p size=%" PRIu64 " flag=%u",
+                symbol_name, rc, allocated_host_ptr, size, kHostRegisterMappedFlag
+            );
+            if (RtFreeHostFn free_fn = resolve_symbol<RtFreeHostFn>(nullptr, {"rtFreeHost"})) {
+                free_fn(allocated_host_ptr);  // best-effort release; its return code is ignored
+            } else if (AclFreeHostFn free_fn = resolve_symbol<AclFreeHostFn>(nullptr, {"aclrtFreeHost"})) {
+                free_fn(allocated_host_ptr);
+            }
+            return (rc != 0) ? rc : -1;
+        }
+    } else {
+        LOG_ERROR("mallocHostDeviceShareMem: missing symbols rtsHostRegister / rtHostRegister / aclrtHostRegister");
+        if (RtFreeHostFn free_fn = resolve_symbol<RtFreeHostFn>(nullptr, {"rtFreeHost"})) {
+            free_fn(allocated_host_ptr);  // do not leak the allocation when it cannot be registered
+        } else if (AclFreeHostFn free_fn = resolve_symbol<AclFreeHostFn>(nullptr, {"aclrtFreeHost"})) {
+            free_fn(allocated_host_ptr);
+        }
+        return -1;
+    }
+
+    *hostPtr = allocated_host_ptr;  // publish the host pointer only once everything succeeded
+    LOG_INFO(
+        "mallocHostDeviceShareMem: device=%u host=%p dev=%p size=%" PRIu64, deviceId, *hostPtr, *devPtr, size
+    );
+    return 0;
+}
+
+int freeHostDeviceShareMem(uint32_t deviceId, void *hostPtr) {
+    if (hostPtr == NULL) {  // freeing a null pointer is a successful no-op
+        return 0;
+    }
+
+    int rc = ensure_current_device_for_share_mem(deviceId);  // current device must be deviceId
+    if (rc != 0) {
+        return rc;
+    }
+
+    const char *symbol_name = nullptr;
+    if (RtHostUnregisterFn unregister_fn =
+            resolve_symbol<RtHostUnregisterFn>(&symbol_name, {"rtsHostUnregister", "rtHostUnregister"})) {
+        rc = unregister_fn(hostPtr);
+        if (rc != 0) {
+            LOG_ERROR("freeHostDeviceShareMem via %s failed on unregister: rc=%d host=%p", symbol_name, rc, hostPtr);
+            return rc;  // the buffer is not freed when unregistering fails
+        }
+    } else if (AclHostUnregisterFn unregister_fn =
+                   resolve_symbol<AclHostUnregisterFn>(&symbol_name, {"aclrtHostUnregister"})) {
+        rc = unregister_fn(hostPtr);
+        if (rc != 0) {
+            LOG_ERROR("freeHostDeviceShareMem via %s failed on unregister: rc=%d host=%p", symbol_name, rc, hostPtr);
+            return rc;  // the buffer is not freed when unregistering fails
+        }
+    } else {
+        LOG_ERROR("freeHostDeviceShareMem: missing symbols rtsHostUnregister / rtHostUnregister / aclrtHostUnregister");
+        return -1;
+    }
+
+    if (RtFreeHostFn free_fn = resolve_symbol<RtFreeHostFn>(&symbol_name, {"rtFreeHost"})) {
+        rc = free_fn(hostPtr);  // release the allocation now that it is unregistered
+        if (rc != 0) {
+            LOG_ERROR("freeHostDeviceShareMem via %s failed on free: rc=%d host=%p", symbol_name, rc, hostPtr);
+            return rc;
+        }
+    } else if (AclFreeHostFn free_fn = resolve_symbol<AclFreeHostFn>(&symbol_name, {"aclrtFreeHost"})) {
+        rc = free_fn(hostPtr);  // release the allocation now that it is unregistered
+        if (rc != 0) {
+            LOG_ERROR("freeHostDeviceShareMem via %s failed on free: rc=%d host=%p", symbol_name, rc, hostPtr);
+            return rc;
+        }
+    } else {
+        LOG_ERROR("freeHostDeviceShareMem: missing symbols rtFreeHost / aclrtFreeHost");
+        return -1;
+    }
+
+    LOG_INFO("freeHostDeviceShareMem: device=%u host=%p", deviceId, hostPtr);
     return 0;
 }
 
diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
index 3e7dfd89e..4d6fc6fa6 100644
--- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
+++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
@@ -121,6 +121,26 @@ int set_device(DeviceContextHandle ctx, int device_id) {
     return 0;
 }
 
+int mallocHostDeviceShareMem(uint32_t deviceId, uint64_t size, void **hostPtr, void **devPtr) {
+    (void)deviceId;  // unused: this build only reports the call as unsupported
+    (void)size;
+    if (hostPtr != nullptr) {  // null the outputs before failing
+        *hostPtr = nullptr;
+    }
+    if (devPtr != nullptr) {
+        *devPtr = nullptr;
+    }
+    LOG_ERROR("mallocHostDeviceShareMem is not supported on a2a3sim");
+    return -1;  // unconditional failure
+}
+
+int freeHostDeviceShareMem(uint32_t deviceId, void *hostPtr) {
+    (void)deviceId;  // unused: this build only reports the call as unsupported
+    (void)hostPtr;
+    LOG_ERROR("freeHostDeviceShareMem is not supported on a2a3sim");
+    return -1;  // unconditional failure, even for a null hostPtr
+}
+
 int run_runtime(
     DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim,
     int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary,
diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
index 2808152eb..8275c69af 100644
--- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
+++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp
@@ -20,6 +20,9 @@
 #include "callable.h"
 #include "task_args.h"
 
+#include <inttypes.h>
+#include <dlfcn.h>
+#include <initializer_list>
 #include <pthread.h>
 #include <vector>
 
@@ -28,6 +31,88 @@
 #include "host/raii_scope_guard.h"
 #include "runtime.h"
 
+namespace {
+
+using RtMallocHostFn = int (*)(void **, uint64_t, uint32_t);
+using RtFreeHostFn = int (*)(void *);
+using RtHostRegisterFn = int (*)(void *, uint64_t, uint32_t, void **);
+using RtHostUnregisterFn = int (*)(void *);
+using AclMallocHostFn = int (*)(void **, size_t);
+using AclFreeHostFn = int (*)(void *);
+using AclHostRegisterFn = int (*)(void *, uint64_t, uint32_t, void **);
+using AclHostUnregisterFn = int (*)(void *);
+using GetDeviceFn = int (*)(int32_t *);
+using DirectSetDeviceFn = int (*)(int32_t);
+
+template <typename Fn>  // Fn: function-pointer type the resolved address is cast to
+Fn resolve_symbol(const char **resolved_name, std::initializer_list<const char *> names) {
+    for (const char *name : names) {  // candidates are tried in order; the first hit wins
+        dlerror();  // clear any stale error so the check below reflects this lookup
+        void *sym = dlsym(RTLD_DEFAULT, name);
+        const char *err = dlerror();
+        if (err == nullptr && sym != nullptr) {
+            if (resolved_name != nullptr) {  // resolved_name is optional
+                *resolved_name = name;  // report which candidate matched
+            }
+            return reinterpret_cast<Fn>(sym);
+        }
+    }
+    if (resolved_name != nullptr) {
+        *resolved_name = nullptr;
+    }
+    return nullptr;  // none of the candidate names resolved
+}
+
+static constexpr uint32_t kHostRegisterMappedFlag =  // flag argument passed to the host-register call
+#if defined(RT_HOST_REGISTER_MAPPED)
+    RT_HOST_REGISTER_MAPPED;
+#elif defined(ACL_HOST_REGISTER_MAPPED)
+    ACL_HOST_REGISTER_MAPPED;
+#else
+    0U;  // neither macro is defined here; NOTE(review): confirm 0 is the intended register flag
+#endif
+
+int ensure_current_device_for_share_mem(uint32_t device_id) {  // 0 on success, non-zero on failure
+    const char *symbol_name = nullptr;
+    if (GetDeviceFn get_device_fn = resolve_symbol<GetDeviceFn>(&symbol_name, {"aclrtGetDevice", "rtGetDevice"})) {
+        int32_t current_device = -1;
+        int rc = get_device_fn(&current_device);
+        if (rc != 0) {  // query failed: try to set the requested device instead
+            LOG_INFO(
+                "ensure_current_device_for_share_mem: %s failed rc=%d, trying to set device to %u",
+                symbol_name, rc, device_id
+            );
+            if (DirectSetDeviceFn set_device_fn =
+                    resolve_symbol<DirectSetDeviceFn>(&symbol_name, {"rtSetDevice", "aclrtSetDevice"})) {
+                rc = set_device_fn(static_cast<int32_t>(device_id));
+                if (rc != 0) {
+                    LOG_ERROR(
+                        "ensure_current_device_for_share_mem: %s(%u) failed: rc=%d", symbol_name, device_id, rc
+                    );
+                    return rc;
+                }
+                return 0;
+            }
+            LOG_ERROR("ensure_current_device_for_share_mem: missing symbols rtSetDevice / aclrtSetDevice");
+            return rc;  // still the error code from the failed get-device call
+        }
+
+        if (current_device != static_cast<int32_t>(device_id)) {  // mismatch is an error; no device switch
+            LOG_ERROR(
+                "ensure_current_device_for_share_mem: current device %d does not match requested device %u",
+                static_cast<int>(current_device), device_id
+            );
+            return -1;
+        }
+        return 0;
+    }
+
+    LOG_ERROR("ensure_current_device_for_share_mem: missing symbols aclrtGetDevice / rtGetDevice");
+    return -1;
+}
+
+}  // namespace
+
 extern "C" {
 
 /* ===========================================================================
@@ -114,8 +199,144 @@ void destroy_device_context(DeviceContextHandle ctx) { delete static_cast<Device
 size_t get_runtime_size(void) { return sizeof(Runtime); }
 
 int set_device(DeviceContextHandle ctx, int device_id) {
-    (void)ctx;
-    (void)device_id;
+    if (ctx == NULL) return -1;  // no device context to attach through
+    try {
+        return static_cast<DeviceRunner *>(ctx)->attach_current_thread(device_id);
+    } catch (...) {  // keep C++ exceptions from crossing the extern "C" boundary
+        return -1;
+    }
+}
+
+int mallocHostDeviceShareMem(uint32_t deviceId, uint64_t size, void **hostPtr, void **devPtr) {
+    if (hostPtr == NULL || devPtr == NULL || size == 0) {  // reject null outputs and zero-size requests
+        return -1;
+    }
+
+    *hostPtr = nullptr;  // assigned again only after registration succeeds
+    *devPtr = nullptr;
+
+    int rc = ensure_current_device_for_share_mem(deviceId);  // current device must be deviceId
+    if (rc != 0) {
+        return rc;
+    }
+
+    void *allocated_host_ptr = nullptr;
+    const char *symbol_name = nullptr;
+    if (RtMallocHostFn malloc_fn = resolve_symbol<RtMallocHostFn>(&symbol_name, {"rtMallocHost"})) {
+        rc = malloc_fn(&allocated_host_ptr, size, 0U);  // NOTE(review): third argument is 0; confirm its meaning
+        if (rc != 0 || allocated_host_ptr == nullptr) {
+            LOG_ERROR("mallocHostDeviceShareMem via %s failed on rtMallocHost: rc=%d size=%" PRIu64, symbol_name, rc, size);
+            return (rc != 0) ? rc : -1;  // rc == 0 with a null pointer is still a failure
+        }
+    } else if (AclMallocHostFn malloc_fn = resolve_symbol<AclMallocHostFn>(&symbol_name, {"aclrtMallocHost"})) {
+        rc = malloc_fn(&allocated_host_ptr, static_cast<size_t>(size));
+        if (rc != 0 || allocated_host_ptr == nullptr) {
+            LOG_ERROR(
+                "mallocHostDeviceShareMem via %s failed on aclrtMallocHost: rc=%d size=%" PRIu64, symbol_name, rc,
+                size
+            );
+            return (rc != 0) ? rc : -1;
+        }
+    } else {
+        LOG_ERROR("mallocHostDeviceShareMem: missing symbols rtMallocHost / aclrtMallocHost");
+        return -1;
+    }
+
+    if (RtHostRegisterFn register_fn =
+            resolve_symbol<RtHostRegisterFn>(&symbol_name, {"rtsHostRegister", "rtHostRegister"})) {
+        rc = register_fn(allocated_host_ptr, size, kHostRegisterMappedFlag, devPtr);
+        if (rc != 0 || *devPtr == nullptr) {  // registration must yield a device pointer
+            LOG_ERROR(
+                "mallocHostDeviceShareMem via %s failed on host register: rc=%d host=%p size=%" PRIu64 " flag=%u",
+                symbol_name, rc, allocated_host_ptr, size, kHostRegisterMappedFlag
+            );
+            if (RtFreeHostFn free_fn = resolve_symbol<RtFreeHostFn>(nullptr, {"rtFreeHost"})) {
+                free_fn(allocated_host_ptr);  // best-effort release; its return code is ignored
+            } else if (AclFreeHostFn free_fn = resolve_symbol<AclFreeHostFn>(nullptr, {"aclrtFreeHost"})) {
+                free_fn(allocated_host_ptr);
+            }
+            return (rc != 0) ? rc : -1;
+        }
+    } else if (AclHostRegisterFn register_fn =
+                   resolve_symbol<AclHostRegisterFn>(&symbol_name, {"aclrtHostRegister"})) {
+        rc = register_fn(allocated_host_ptr, size, kHostRegisterMappedFlag, devPtr);
+        if (rc != 0 || *devPtr == nullptr) {  // registration must yield a device pointer
+            LOG_ERROR(
+                "mallocHostDeviceShareMem via %s failed on host register: rc=%d host=%p size=%" PRIu64 " flag=%u",
+                symbol_name, rc, allocated_host_ptr, size, kHostRegisterMappedFlag
+            );
+            if (RtFreeHostFn free_fn = resolve_symbol<RtFreeHostFn>(nullptr, {"rtFreeHost"})) {
+                free_fn(allocated_host_ptr);  // best-effort release; its return code is ignored
+            } else if (AclFreeHostFn free_fn = resolve_symbol<AclFreeHostFn>(nullptr, {"aclrtFreeHost"})) {
+                free_fn(allocated_host_ptr);
+            }
+            return (rc != 0) ? rc : -1;
+        }
+    } else {
+        LOG_ERROR("mallocHostDeviceShareMem: missing symbols rtsHostRegister / rtHostRegister / aclrtHostRegister");
+        if (RtFreeHostFn free_fn = resolve_symbol<RtFreeHostFn>(nullptr, {"rtFreeHost"})) {
+            free_fn(allocated_host_ptr);  // do not leak the allocation when it cannot be registered
+        } else if (AclFreeHostFn free_fn = resolve_symbol<AclFreeHostFn>(nullptr, {"aclrtFreeHost"})) {
+            free_fn(allocated_host_ptr);
+        }
+        return -1;
+    }
+
+    *hostPtr = allocated_host_ptr;  // publish the host pointer only once everything succeeded
+    LOG_INFO(
+        "mallocHostDeviceShareMem: device=%u host=%p dev=%p size=%" PRIu64, deviceId, *hostPtr, *devPtr, size
+    );
+    return 0;
+}
+
+int freeHostDeviceShareMem(uint32_t deviceId, void *hostPtr) {
+    if (hostPtr == NULL) {  // freeing a null pointer is a successful no-op
+        return 0;
+    }
+
+    int rc = ensure_current_device_for_share_mem(deviceId);  // current device must be deviceId
+    if (rc != 0) {
+        return rc;
+    }
+
+    const char *symbol_name = nullptr;
+    if (RtHostUnregisterFn unregister_fn =
+            resolve_symbol<RtHostUnregisterFn>(&symbol_name, {"rtsHostUnregister", "rtHostUnregister"})) {
+        rc = unregister_fn(hostPtr);
+        if (rc != 0) {
+            LOG_ERROR("freeHostDeviceShareMem via %s failed on unregister: rc=%d host=%p", symbol_name, rc, hostPtr);
+            return rc;  // the buffer is not freed when unregistering fails
+        }
+    } else if (AclHostUnregisterFn unregister_fn =
+                   resolve_symbol<AclHostUnregisterFn>(&symbol_name, {"aclrtHostUnregister"})) {
+        rc = unregister_fn(hostPtr);
+        if (rc != 0) {
+            LOG_ERROR("freeHostDeviceShareMem via %s failed on unregister: rc=%d host=%p", symbol_name, rc, hostPtr);
+            return rc;  // the buffer is not freed when unregistering fails
+        }
+    } else {
+        LOG_ERROR("freeHostDeviceShareMem: missing symbols rtsHostUnregister / rtHostUnregister / aclrtHostUnregister");
+        return -1;
+    }
+
+    if (RtFreeHostFn free_fn = resolve_symbol<RtFreeHostFn>(&symbol_name, {"rtFreeHost"})) {
+        rc = free_fn(hostPtr);  // release the allocation now that it is unregistered
+        if (rc != 0) {
+            LOG_ERROR("freeHostDeviceShareMem via %s failed on free: rc=%d host=%p", symbol_name, rc, hostPtr);
+            return rc;
+        }
+    } else if (AclFreeHostFn free_fn = resolve_symbol<AclFreeHostFn>(&symbol_name, {"aclrtFreeHost"})) {
+        rc = free_fn(hostPtr);  // release the allocation now that it is unregistered
+        if (rc != 0) {
+            LOG_ERROR("freeHostDeviceShareMem via %s failed on free: rc=%d host=%p", symbol_name, rc, hostPtr);
+            return rc;
+        }
+    } else {
+        LOG_ERROR("freeHostDeviceShareMem: missing symbols rtFreeHost / aclrtFreeHost");
+        return -1;
+    }
+
+    LOG_INFO("freeHostDeviceShareMem: device=%u host=%p", deviceId, hostPtr);
     return 0;
 }
 
diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp
index 3e7dfd89e..52f1f57c6 100644
--- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp
+++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp
@@ -121,6 +121,26 @@ int set_device(DeviceContextHandle ctx, int device_id) {
     return 0;
 }
 
+int mallocHostDeviceShareMem(uint32_t deviceId, uint64_t size, void **hostPtr, void **devPtr) {
+    (void)deviceId;  // unused: this build only reports the call as unsupported
+    (void)size;
+    if (hostPtr != nullptr) {  // null the outputs before failing
+        *hostPtr = nullptr;
+    }
+    if (devPtr != nullptr) {
+        *devPtr = nullptr;
+    }
+    LOG_ERROR("mallocHostDeviceShareMem is not supported on a5sim");
+    return -1;  // unconditional failure
+}
+
+int freeHostDeviceShareMem(uint32_t deviceId, void *hostPtr) {
+    (void)deviceId;  // unused: this build only reports the call as unsupported
+    (void)hostPtr;
+    LOG_ERROR("freeHostDeviceShareMem is not supported on a5sim");
+    return -1;  // unconditional failure, even for a null hostPtr
+}
+
 int run_runtime(
     DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim,
     int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary,
diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp
index e919fcc27..e86bf601e 100644
--- a/src/common/worker/chip_worker.cpp
+++ b/src/common/worker/chip_worker.cpp
@@ -37,6 +37,14 @@ T load_symbol(void *handle, const char *name) {
     return reinterpret_cast<T>(sym);
 }
 
+template <typename T>  // T: function-pointer type the looked-up address is cast to
+T try_load_symbol(void *handle, const char *name) {
+    dlerror();  // clear any existing error
+    void *sym = dlsym(handle, name);
+    (void)dlerror();  // discard the lookup error: a missing symbol simply yields nullptr
+    return reinterpret_cast<T>(sym);
+}
+
 // Process-wide singleton: libcpu_sim_context.so is loaded once with
 // RTLD_GLOBAL so that host_runtime.so can resolve sim_context_set_* and
 // pto_sim_get_* symbols at runtime.  Never dlclosed.
@@ -117,6 +125,9 @@ void ChipWorker::init(
         get_runtime_size_fn_ = load_symbol<GetRuntimeSizeFn>(handle, "get_runtime_size");
         run_runtime_fn_ = load_symbol<RunRuntimeFn>(handle, "run_runtime");
         finalize_device_fn_ = load_symbol<FinalizeDeviceFn>(handle, "finalize_device");
+        malloc_host_device_share_mem_fn_ =
+            try_load_symbol<MallocHostDeviceShareMemFn>(handle, "mallocHostDeviceShareMem");
+        free_host_device_share_mem_fn_ = try_load_symbol<FreeHostDeviceShareMemFn>(handle, "freeHostDeviceShareMem");
     } catch (...) {
         dlclose(handle);
         throw;
@@ -164,6 +175,49 @@ void ChipWorker::reset_device() {
     device_set_ = false;
 }
 
+// Allocate a device-mapped host buffer via the bound runtime; device_id < 0 falls back to device_id_.
+void ChipWorker::mallocHostDeviceShareMem(uint64_t size, uint64_t *host_ptr, uint64_t *dev_ptr, int device_id) {
+    if (!device_set_) {
+        throw std::runtime_error("ChipWorker device not set; call set_device() first");
+    }
+    if (host_ptr == nullptr || dev_ptr == nullptr) {
+        throw std::runtime_error("mallocHostDeviceShareMem requires non-null output pointers");
+    }
+    if (malloc_host_device_share_mem_fn_ == nullptr) {
+        throw std::runtime_error("mallocHostDeviceShareMem symbol is not available in the bound runtime");
+    }
+    *host_ptr = *dev_ptr = 0;  // callers observe zeroed outputs if any step below throws
+    const auto target = static_cast<uint32_t>((device_id >= 0) ? device_id : device_id_);
+    void *host_raw = nullptr, *dev_raw = nullptr;
+    const int rc = malloc_host_device_share_mem_fn_(target, size, &host_raw, &dev_raw);
+    if (rc != 0 || host_raw == nullptr || dev_raw == nullptr) {
+        if (rc == 0 && host_raw != nullptr && free_host_device_share_mem_fn_ != nullptr) {
+            (void)free_host_device_share_mem_fn_(target, host_raw);  // rc == 0 yet no device address: don't leak
+        }
+        throw std::runtime_error("mallocHostDeviceShareMem failed with code " + std::to_string(rc));
+    }
+    *host_ptr = reinterpret_cast<uint64_t>(host_raw);
+    *dev_ptr = reinterpret_cast<uint64_t>(dev_raw);
+}
+
+// Unregister and free a mapped buffer; host_ptr == 0 is a no-op, device_id < 0 falls back to device_id_.
+void ChipWorker::freeHostDeviceShareMem(uint64_t host_ptr, int device_id) {
+    if (host_ptr == 0) {
+        return;  // a zero address is treated as nothing to release
+    }
+    if (!device_set_) {
+        throw std::runtime_error("ChipWorker device not set; call set_device() first");
+    }
+    if (free_host_device_share_mem_fn_ == nullptr) {
+        throw std::runtime_error("freeHostDeviceShareMem symbol is not available in the bound runtime");
+    }
+    const auto target = static_cast<uint32_t>((device_id >= 0) ? device_id : device_id_);
+    const int status = free_host_device_share_mem_fn_(target, reinterpret_cast<void *>(host_ptr));
+    if (status != 0) {
+        throw std::runtime_error("freeHostDeviceShareMem failed with code " + std::to_string(status));
+    }
+}
+
 void ChipWorker::finalize() {
     reset_device();
     if (device_ctx_ != nullptr && destroy_device_context_fn_ != nullptr) {
@@ -180,6 +234,8 @@ void ChipWorker::finalize() {
     get_runtime_size_fn_ = nullptr;
     run_runtime_fn_ = nullptr;
     finalize_device_fn_ = nullptr;
+    malloc_host_device_share_mem_fn_ = nullptr;
+    free_host_device_share_mem_fn_ = nullptr;
     runtime_buf_.clear();
     aicpu_binary_.clear();
     aicore_binary_.clear();
diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h
index 2a436c821..155840a38 100644
--- a/src/common/worker/chip_worker.h
+++ b/src/common/worker/chip_worker.h
@@ -43,6 +43,12 @@ class ChipWorker : public IWorker {
     /// After this, set_device() can be called again with a new device ID.
     void reset_device();
 
+    /// Allocate host memory registered as a device-visible mapped buffer; throws std::runtime_error on failure.
+    void mallocHostDeviceShareMem(uint64_t size, uint64_t *host_ptr, uint64_t *dev_ptr, int device_id = -1);
+
+    /// Unregister and free a mapped host buffer; host_ptr == 0 is a no-op, failures throw std::runtime_error.
+    void freeHostDeviceShareMem(uint64_t host_ptr, int device_id = -1);
+
     /// Tear down everything: device resources and runtime library.
     /// Terminal — the object cannot be reused after this.
     void finalize();
@@ -70,6 +76,8 @@ class ChipWorker : public IWorker {
         int, int
     );
     using FinalizeDeviceFn = int (*)(void *);
+    using MallocHostDeviceShareMemFn = int (*)(uint32_t, uint64_t, void **, void **);
+    using FreeHostDeviceShareMemFn = int (*)(uint32_t, void *);
 
     void *lib_handle_ = nullptr;
     CreateDeviceContextFn create_device_context_fn_ = nullptr;
@@ -78,6 +86,8 @@ class ChipWorker : public IWorker {
     GetRuntimeSizeFn get_runtime_size_fn_ = nullptr;
     RunRuntimeFn run_runtime_fn_ = nullptr;
     FinalizeDeviceFn finalize_device_fn_ = nullptr;
+    MallocHostDeviceShareMemFn malloc_host_device_share_mem_fn_ = nullptr;
+    FreeHostDeviceShareMemFn free_host_device_share_mem_fn_ = nullptr;
     void *device_ctx_ = nullptr;
 
     std::vector<uint8_t> runtime_buf_;
diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h
index f8a811d94..dc6c334ee 100644
--- a/src/common/worker/pto_runtime_c_api.h
+++ b/src/common/worker/pto_runtime_c_api.h
@@ -17,7 +17,8 @@
  *
  * Public API — resolved by ChipWorker via dlsym:
  *   create_device_context, destroy_device_context,
- *   get_runtime_size, set_device, run_runtime, finalize_device
+ *   get_runtime_size, set_device, run_runtime, finalize_device,
+ *   mallocHostDeviceShareMem, freeHostDeviceShareMem (optional: looked up without failing init when absent)
  *
  * Memory management: caller allocates a buffer of get_runtime_size() bytes
  * and passes it to run_runtime(). Error codes: 0 = success, negative = error.
@@ -59,6 +60,26 @@ size_t get_runtime_size(void);
 /** Set the target device. Must be called before the first run_runtime(). */
 int set_device(DeviceContextHandle ctx, int device_id);
 
+/**
+ * Allocate host memory and register it as a device-visible mapped address; free it with freeHostDeviceShareMem().
+ *
+ * @param deviceId  Target device ID
+ * @param size      Size in bytes
+ * @param hostPtr   Output: host pointer to the allocated buffer
+ * @param devPtr    Output: device-visible address mapped to the same buffer
+ * @return 0 on success, negative on error
+ */
+int mallocHostDeviceShareMem(uint32_t deviceId, uint64_t size, void **hostPtr, void **devPtr);
+
+/**
+ * Unregister and free host memory previously created by mallocHostDeviceShareMem().
+ *
+ * @param deviceId  Target device ID (presumably the one used for the allocation; TODO confirm)
+ * @param hostPtr   Host pointer previously returned by mallocHostDeviceShareMem()
+ * @return 0 on success, negative on error
+ */
+int freeHostDeviceShareMem(uint32_t deviceId, void *hostPtr);
+
 /**
  * Build the task graph, execute on device, copy results back, and clean up.
  *