Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,12 @@ def pytest_addoption(parser):
"--skip-golden", action="store_true", default=False, help="Skip golden comparison (benchmark mode)"
)
parser.addoption(
"--enable-profiling", action="store_true", default=False, help="Enable profiling (first round only)"
"--enable-profiling",
type=int,
nargs="?",
const=3,
default=0,
help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)",
)
parser.addoption("--dump-tensor", action="store_true", default=False, help="Dump per-task tensor I/O at runtime")
parser.addoption("--build", action="store_true", default=False, help="Compile runtime from source")
Expand Down
8 changes: 6 additions & 2 deletions examples/scripts/run_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,12 @@ def compute_golden(tensors: dict, params: dict) -> None:

parser.add_argument(
"--enable-profiling",
action="store_true",
help="Enable profiling and generate swimlane.json",
type=int,
nargs="?",
const=3,
default=0,
metavar="LEVEL",
help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)",
)

parser.add_argument(
Expand Down
28 changes: 20 additions & 8 deletions python/bindings/task_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -540,12 +540,24 @@ NB_MODULE(_task_interface, m) {
.def(nb::init<>())
.def_rw("block_dim", &ChipCallConfig::block_dim)
.def_rw("aicpu_thread_num", &ChipCallConfig::aicpu_thread_num)
.def_rw("enable_profiling", &ChipCallConfig::enable_profiling)
.def_prop_rw(
"enable_profiling",
[](const ChipCallConfig &self) {
return self.perf_level;
},
[](ChipCallConfig &self, nb::object v) {
if (nb::isinstance<nb::bool_>(v)) {
self.perf_level = nb::cast<bool>(v) ? 3 : 0;
} else {
self.perf_level = nb::cast<int>(v);
}
}
)
.def_rw("enable_dump_tensor", &ChipCallConfig::enable_dump_tensor)
.def("__repr__", [](const ChipCallConfig &self) -> std::string {
std::ostringstream os;
os << "ChipCallConfig(block_dim=" << self.block_dim << ", aicpu_thread_num=" << self.aicpu_thread_num
<< ", enable_profiling=" << (self.enable_profiling ? "True" : "False")
<< ", enable_profiling=" << self.perf_level
<< ", enable_dump_tensor=" << (self.enable_dump_tensor ? "True" : "False") << ")";
return os.str();
});
Expand All @@ -571,29 +583,29 @@ NB_MODULE(_task_interface, m) {
.def(
"run_raw",
[](ChipWorker &self, uint64_t callable, uint64_t args, int block_dim, int aicpu_thread_num,
bool enable_profiling) {
int perf_level) {
ChipCallConfig config;
config.block_dim = block_dim;
config.aicpu_thread_num = aicpu_thread_num;
config.enable_profiling = enable_profiling;
config.perf_level = perf_level;
self.run(reinterpret_cast<const void *>(callable), reinterpret_cast<const void *>(args), config);
},
nb::arg("callable"), nb::arg("args"), nb::arg("block_dim") = 1, nb::arg("aicpu_thread_num") = 3,
nb::arg("enable_profiling") = false, "Run with a raw ChipStorageTaskArgs POD pointer."
nb::arg("perf_level") = 0, "Run with a raw ChipStorageTaskArgs POD pointer."
)
.def(
"run_from_blob",
[](ChipWorker &self, uint64_t callable, uint64_t blob_ptr, int block_dim, int aicpu_thread_num,
bool enable_profiling) {
int perf_level) {
ChipCallConfig config;
config.block_dim = block_dim;
config.aicpu_thread_num = aicpu_thread_num;
config.enable_profiling = enable_profiling;
config.perf_level = perf_level;
TaskArgsView view = read_blob(reinterpret_cast<const uint8_t *>(blob_ptr));
self.run(callable, view, config);
},
nb::arg("callable"), nb::arg("blob_ptr"), nb::arg("block_dim") = 1, nb::arg("aicpu_thread_num") = 3,
nb::arg("enable_profiling") = false,
nb::arg("perf_level") = 0,
"Decode a length-prefixed TaskArgs blob ([T][S][tensors][scalars]) at "
"blob_ptr and dispatch to the runtime. Used from forked chip processes "
"reading the WorkerThread mailbox."
Expand Down
2 changes: 1 addition & 1 deletion python/simpler/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ def _chip_process_loop(

error = 0
try:
cw.run_from_blob(callable_ptr, args_ptr, block_dim, aicpu_tn, bool(profiling))
cw.run_from_blob(callable_ptr, args_ptr, block_dim, aicpu_tn, profiling)
except Exception: # noqa: BLE001
error = 1
struct.pack_into("i", buf, _CHIP_OFF_ERROR, error)
Expand Down
16 changes: 11 additions & 5 deletions simpler_setup/code_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,12 @@ def _load_module_from_path(module_path: Path, module_name: str):
return module


def _normalize_perf_level(v) -> int:
if isinstance(v, bool):
return 3 if v else 0
return int(v)


def _kernel_config_runtime_env(kernel_config_module, kernels_dir: Path) -> dict[str, str]:
"""
Optional per-example environment variables for runtime compilation.
Expand Down Expand Up @@ -192,7 +198,7 @@ def __init__( # noqa: PLR0913
golden_path: str,
device_id: Optional[int] = None,
platform: str = "a2a3",
enable_profiling: bool = False,
enable_profiling: int = 0,
enable_dump_tensor: bool = False,
run_all_cases: bool = False,
case_name: Optional[str] = None,
Expand All @@ -212,7 +218,7 @@ def __init__( # noqa: PLR0913
self.kernels_dir = Path(kernels_dir).resolve()
self.golden_path = Path(golden_path).resolve()
self.platform = platform
self.enable_profiling = enable_profiling
self._perf_level = _normalize_perf_level(enable_profiling)
self.enable_dump_tensor = enable_dump_tensor
self.skip_golden = skip_golden
self.project_root = PROJECT_ROOT
Expand Down Expand Up @@ -607,9 +613,9 @@ def _compile_one_kernel(kernel):
config = ChipCallConfig()
config.block_dim = self.block_dim
config.aicpu_thread_num = self.aicpu_thread_num
if self.enable_profiling and round_idx == 0:
config.enable_profiling = True
logger.info("Profiling enabled")
if self._perf_level > 0 and round_idx == 0:
config.enable_profiling = self._perf_level
logger.info(f"Swimlane profiling enabled (mode={self._perf_level})")
if self.enable_dump_tensor:
config.enable_dump_tensor = True
logger.info("Dump tensor enabled")
Expand Down
16 changes: 12 additions & 4 deletions simpler_setup/scene_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -700,7 +700,7 @@ def build_callable(self, platform):
return self._compile_l3_callables(platform)
raise ValueError(f"Unsupported level: {self._st_level}")

def _build_config(self, config_dict, enable_profiling=False, enable_dump_tensor=False):
def _build_config(self, config_dict, enable_profiling=0, enable_dump_tensor=False):
from simpler.task_interface import ChipCallConfig # noqa: PLC0415

config = ChipCallConfig()
Expand Down Expand Up @@ -791,7 +791,7 @@ def _run_and_validate_l2(

config = self._build_config(
config_dict,
enable_profiling=(enable_profiling and round_idx == 0),
enable_profiling=(enable_profiling if round_idx == 0 else 0),
enable_dump_tensor=enable_dump_tensor,
)

Expand Down Expand Up @@ -847,7 +847,7 @@ def _run_and_validate_l3(

config = self._build_config(
config_dict,
enable_profiling=(enable_profiling and round_idx == 0),
enable_profiling=(enable_profiling if round_idx == 0 else 0),
enable_dump_tensor=enable_dump_tensor,
)

Expand Down Expand Up @@ -948,7 +948,15 @@ def run_module(module_name):
)
parser.add_argument("-n", "--rounds", type=int, default=1, help="Run each case N times (default: 1)")
parser.add_argument("--skip-golden", action="store_true", help="Skip golden comparison (benchmark mode)")
parser.add_argument("--enable-profiling", action="store_true", help="Enable profiling (first round only)")
parser.add_argument(
"--enable-profiling",
type=int,
nargs="?",
const=3,
default=0,
metavar="LEVEL",
help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)",
)
parser.add_argument("--dump-tensor", action="store_true", help="Dump per-task tensor I/O at runtime")
parser.add_argument("--build", action="store_true", help="Compile runtime from source")
parser.add_argument(
Expand Down
9 changes: 9 additions & 0 deletions src/a2a3/platform/include/host/performance_collector.h
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,12 @@ class PerformanceCollector {
*/
bool is_initialized() const { return perf_shared_mem_host_ != nullptr; }

/**
* Set profiling level before initialize().
* 0=off, 1=AICore-only, 2=task+fanout, 3=full (with phase buffers)
*/
void set_perf_level(int level) { perf_level_ = level; }

/**
* Drain remaining buffers from the memory manager's ready queue
*
Expand Down Expand Up @@ -387,6 +393,9 @@ class PerformanceCollector {
PerfRegisterCallback register_cb_{nullptr};
PerfFreeCallback free_cb_{nullptr};

// Profiling level: 0=off, 1=AICore-only, 2=task+fanout, 3=full (with phase)
int perf_level_{0};

// Memory manager
ProfMemoryManager memory_manager_;

Expand Down
37 changes: 37 additions & 0 deletions src/a2a3/platform/include/host/runtime_profiling_mode.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Copyright (c) PyPTO Contributors.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
* -----------------------------------------------------------------------------------------------------------
*/

/**
* Shared helper: set perf_level and legacy enable_profiling on a Runtime struct.
*
* Used by both onboard and sim pto_runtime_c_api.cpp implementations.
* Some runtime structs still carry a bool enable_profiling member alongside
* the newer int perf_level. This template detects the legacy member at
* compile time and keeps both in sync.
*/

#pragma once

#include <type_traits>

// Primary template: by default, assume T has no legacy `enable_profiling` member.
template <typename T, typename = void>
struct HasEnableProfilingMember : std::false_type {};

// SFINAE specialization: selected (via std::void_t) only when the expression
// `declval<T&>().enable_profiling` is well-formed, i.e. the member exists.
template <typename T>
struct HasEnableProfilingMember<T, std::void_t<decltype(std::declval<T &>().enable_profiling)>> : std::true_type {};

// Write the profiling level into the runtime struct; when the struct still
// carries the legacy bool `enable_profiling`, mirror the level into it
// (any level > 0 means "profiling on"). The parameter keeps its historical
// name for source compatibility, but it is a level (0..3), not a bool.
template <typename R>
static inline void set_runtime_profiling_mode(R *runtime, int enable_profiling) {
runtime->perf_level = enable_profiling;
if constexpr (HasEnableProfilingMember<R>::value) {
runtime->enable_profiling = (enable_profiling > 0);
}
}
11 changes: 6 additions & 5 deletions src/a2a3/platform/onboard/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,7 @@ int DeviceRunner::run(
});

// Initialize performance profiling if enabled
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
rc = init_performance_profiling(runtime, num_aicore, device_id);
if (rc != 0) {
LOG_ERROR("init_performance_profiling failed: %d", rc);
Expand Down Expand Up @@ -540,18 +540,18 @@ int DeviceRunner::run(
{
// Poll and collect performance data in a separate collector thread
std::thread collector_thread;
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
collector_thread = create_thread([this, &runtime]() {
poll_and_collect_performance_data(runtime.get_task_count());
});
}
auto thread_guard = RAIIScopeGuard([&]() {
if (runtime.enable_profiling && collector_thread.joinable()) {
if (runtime.perf_level > 0 && collector_thread.joinable()) {
collector_thread.join();
}
});
auto collector_signal_guard = RAIIScopeGuard([this, &runtime]() {
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
perf_collector_.signal_execution_complete();
}
});
Expand Down Expand Up @@ -588,7 +588,7 @@ int DeviceRunner::run(
}

// Stop memory management, drain remaining buffers, collect phase data, export
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
perf_collector_.stop_memory_manager();
perf_collector_.drain_remaining_buffers();
perf_collector_.scan_remaining_perf_buffers();
Expand Down Expand Up @@ -872,6 +872,7 @@ int DeviceRunner::init_performance_profiling(Runtime &runtime, int num_aicore, i
return rtFree(dev_ptr);
};

perf_collector_.set_perf_level(runtime.perf_level);
return perf_collector_.initialize(runtime, num_aicore, device_id, alloc_cb, register_cb, free_cb);
}

Expand Down
5 changes: 2 additions & 3 deletions src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "common/unified_log.h"
#include "device_runner.h"
#include "host/raii_scope_guard.h"
#include "host/runtime_profiling_mode.h"
#include "runtime.h"

extern "C" {
Expand Down Expand Up @@ -162,9 +163,7 @@ int run_runtime(
return rc;
}

if (enable_profiling) {
r->enable_profiling = true;
}
set_runtime_profiling_mode(r, enable_profiling);

std::vector<uint8_t> aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size);
std::vector<uint8_t> aicore_vec(aicore_binary, aicore_binary + aicore_size);
Expand Down
15 changes: 8 additions & 7 deletions src/a2a3/platform/sim/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ int DeviceRunner::run(
last_runtime_ = &runtime;

// Initialize performance profiling if enabled
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
rc = init_performance_profiling(runtime, num_aicore, device_id);
if (rc != 0) {
LOG_ERROR("init_performance_profiling failed: %d", rc);
Expand Down Expand Up @@ -420,7 +420,7 @@ int DeviceRunner::run(

// Poll and collect performance data during execution (if enabled)
std::thread collector_thread;
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
collector_thread = create_thread([this, &runtime]() {
poll_and_collect_performance_data(runtime.get_task_count());
});
Expand All @@ -442,13 +442,13 @@ int DeviceRunner::run(
}

// Signal collector that device execution is complete
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
perf_collector_.signal_execution_complete();
}
dump_collector_.signal_execution_complete();

// Wait for collector thread if it was launched
if (runtime.enable_profiling && collector_thread.joinable()) {
if (runtime.perf_level > 0 && collector_thread.joinable()) {
collector_thread.join();
}
if (dump_collector_thread.joinable()) {
Expand All @@ -465,12 +465,12 @@ int DeviceRunner::run(
}

// Signal collector that device execution is complete
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
perf_collector_.signal_execution_complete();
}

// Wait for collector thread if it was launched
if (runtime.enable_profiling && collector_thread.joinable()) {
if (runtime.perf_level > 0 && collector_thread.joinable()) {
collector_thread.join();
}
}
Expand All @@ -484,7 +484,7 @@ int DeviceRunner::run(
}

// Stop memory management, drain remaining buffers, collect phase data, export
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
perf_collector_.stop_memory_manager();
perf_collector_.drain_remaining_buffers();
perf_collector_.scan_remaining_perf_buffers();
Expand Down Expand Up @@ -710,6 +710,7 @@ void DeviceRunner::remove_kernel_binary(int func_id) {
// =============================================================================

int DeviceRunner::init_performance_profiling(Runtime &runtime, int num_aicore, int device_id) {
perf_collector_.set_perf_level(runtime.perf_level);
// Define allocation callback (a2a3sim: use malloc)
auto alloc_cb = [](size_t size) -> void * {
return malloc(size);
Expand Down
Loading
Loading