diff --git a/conftest.py b/conftest.py index 97e43d534..ca744493c 100644 --- a/conftest.py +++ b/conftest.py @@ -84,7 +84,12 @@ def pytest_addoption(parser): "--skip-golden", action="store_true", default=False, help="Skip golden comparison (benchmark mode)" ) parser.addoption( - "--enable-profiling", action="store_true", default=False, help="Enable profiling (first round only)" + "--enable-profiling", + type=int, + nargs="?", + const=3, + default=0, + help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)", ) parser.addoption("--dump-tensor", action="store_true", default=False, help="Dump per-task tensor I/O at runtime") parser.addoption("--build", action="store_true", default=False, help="Compile runtime from source") diff --git a/examples/scripts/run_example.py b/examples/scripts/run_example.py index 839b5600f..db58f9dcd 100644 --- a/examples/scripts/run_example.py +++ b/examples/scripts/run_example.py @@ -139,8 +139,12 @@ def compute_golden(tensors: dict, params: dict) -> None: parser.add_argument( "--enable-profiling", - action="store_true", - help="Enable profiling and generate swimlane.json", + type=int, + nargs="?", + const=3, + default=0, + metavar="LEVEL", + help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)", ) parser.add_argument( diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp index d949735c3..148b66824 100644 --- a/python/bindings/task_interface.cpp +++ b/python/bindings/task_interface.cpp @@ -540,12 +540,24 @@ NB_MODULE(_task_interface, m) { .def(nb::init<>()) .def_rw("block_dim", &ChipCallConfig::block_dim) .def_rw("aicpu_thread_num", &ChipCallConfig::aicpu_thread_num) - .def_rw("enable_profiling", &ChipCallConfig::enable_profiling) + .def_prop_rw( + "enable_profiling", + [](const ChipCallConfig &self) { + return self.perf_level; + }, + [](ChipCallConfig &self, nb::object v) { + if (nb::isinstance(v)) { + self.perf_level = 
nb::cast(v) ? 3 : 0; + } else { + self.perf_level = nb::cast(v); + } + } + ) .def_rw("enable_dump_tensor", &ChipCallConfig::enable_dump_tensor) .def("__repr__", [](const ChipCallConfig &self) -> std::string { std::ostringstream os; os << "ChipCallConfig(block_dim=" << self.block_dim << ", aicpu_thread_num=" << self.aicpu_thread_num - << ", enable_profiling=" << (self.enable_profiling ? "True" : "False") + << ", enable_profiling=" << self.perf_level << ", enable_dump_tensor=" << (self.enable_dump_tensor ? "True" : "False") << ")"; return os.str(); }); @@ -571,29 +583,29 @@ NB_MODULE(_task_interface, m) { .def( "run_raw", [](ChipWorker &self, uint64_t callable, uint64_t args, int block_dim, int aicpu_thread_num, - bool enable_profiling) { + int perf_level) { ChipCallConfig config; config.block_dim = block_dim; config.aicpu_thread_num = aicpu_thread_num; - config.enable_profiling = enable_profiling; + config.perf_level = perf_level; self.run(reinterpret_cast(callable), reinterpret_cast(args), config); }, nb::arg("callable"), nb::arg("args"), nb::arg("block_dim") = 1, nb::arg("aicpu_thread_num") = 3, - nb::arg("enable_profiling") = false, "Run with a raw ChipStorageTaskArgs POD pointer." + nb::arg("perf_level") = 0, "Run with a raw ChipStorageTaskArgs POD pointer." 
) .def( "run_from_blob", [](ChipWorker &self, uint64_t callable, uint64_t blob_ptr, int block_dim, int aicpu_thread_num, - bool enable_profiling) { + int perf_level) { ChipCallConfig config; config.block_dim = block_dim; config.aicpu_thread_num = aicpu_thread_num; - config.enable_profiling = enable_profiling; + config.perf_level = perf_level; TaskArgsView view = read_blob(reinterpret_cast(blob_ptr)); self.run(callable, view, config); }, nb::arg("callable"), nb::arg("blob_ptr"), nb::arg("block_dim") = 1, nb::arg("aicpu_thread_num") = 3, - nb::arg("enable_profiling") = false, + nb::arg("perf_level") = 0, "Decode a length-prefixed TaskArgs blob ([T][S][tensors][scalars]) at " "blob_ptr and dispatch to the runtime. Used from forked chip processes " "reading the WorkerThread mailbox." diff --git a/python/simpler/worker.py b/python/simpler/worker.py index d40d4c235..da2e0b62a 100644 --- a/python/simpler/worker.py +++ b/python/simpler/worker.py @@ -161,7 +161,7 @@ def _chip_process_loop( error = 0 try: - cw.run_from_blob(callable_ptr, args_ptr, block_dim, aicpu_tn, bool(profiling)) + cw.run_from_blob(callable_ptr, args_ptr, block_dim, aicpu_tn, profiling) except Exception: # noqa: BLE001 error = 1 struct.pack_into("i", buf, _CHIP_OFF_ERROR, error) diff --git a/simpler_setup/code_runner.py b/simpler_setup/code_runner.py index a70f6096c..df761d117 100644 --- a/simpler_setup/code_runner.py +++ b/simpler_setup/code_runner.py @@ -123,6 +123,12 @@ def _load_module_from_path(module_path: Path, module_name: str): return module +def _normalize_perf_level(v) -> int: + if isinstance(v, bool): + return 3 if v else 0 + return int(v) + + def _kernel_config_runtime_env(kernel_config_module, kernels_dir: Path) -> dict[str, str]: """ Optional per-example environment variables for runtime compilation. 
@@ -192,7 +198,7 @@ def __init__( # noqa: PLR0913 golden_path: str, device_id: Optional[int] = None, platform: str = "a2a3", - enable_profiling: bool = False, + enable_profiling: int = 0, enable_dump_tensor: bool = False, run_all_cases: bool = False, case_name: Optional[str] = None, @@ -212,7 +218,7 @@ def __init__( # noqa: PLR0913 self.kernels_dir = Path(kernels_dir).resolve() self.golden_path = Path(golden_path).resolve() self.platform = platform - self.enable_profiling = enable_profiling + self._perf_level = _normalize_perf_level(enable_profiling) self.enable_dump_tensor = enable_dump_tensor self.skip_golden = skip_golden self.project_root = PROJECT_ROOT @@ -607,9 +613,9 @@ def _compile_one_kernel(kernel): config = ChipCallConfig() config.block_dim = self.block_dim config.aicpu_thread_num = self.aicpu_thread_num - if self.enable_profiling and round_idx == 0: - config.enable_profiling = True - logger.info("Profiling enabled") + if self._perf_level > 0 and round_idx == 0: + config.enable_profiling = self._perf_level + logger.info(f"Swimlane profiling enabled (mode={self._perf_level})") if self.enable_dump_tensor: config.enable_dump_tensor = True logger.info("Dump tensor enabled") diff --git a/simpler_setup/scene_test.py b/simpler_setup/scene_test.py index 600fcab34..f4fd3db43 100644 --- a/simpler_setup/scene_test.py +++ b/simpler_setup/scene_test.py @@ -700,7 +700,7 @@ def build_callable(self, platform): return self._compile_l3_callables(platform) raise ValueError(f"Unsupported level: {self._st_level}") - def _build_config(self, config_dict, enable_profiling=False, enable_dump_tensor=False): + def _build_config(self, config_dict, enable_profiling=0, enable_dump_tensor=False): from simpler.task_interface import ChipCallConfig # noqa: PLC0415 config = ChipCallConfig() @@ -791,7 +791,7 @@ def _run_and_validate_l2( config = self._build_config( config_dict, - enable_profiling=(enable_profiling and round_idx == 0), + enable_profiling=(enable_profiling if round_idx == 0 
else 0), enable_dump_tensor=enable_dump_tensor, ) @@ -847,7 +847,7 @@ def _run_and_validate_l3( config = self._build_config( config_dict, - enable_profiling=(enable_profiling and round_idx == 0), + enable_profiling=(enable_profiling if round_idx == 0 else 0), enable_dump_tensor=enable_dump_tensor, ) @@ -948,7 +948,15 @@ def run_module(module_name): ) parser.add_argument("-n", "--rounds", type=int, default=1, help="Run each case N times (default: 1)") parser.add_argument("--skip-golden", action="store_true", help="Skip golden comparison (benchmark mode)") - parser.add_argument("--enable-profiling", action="store_true", help="Enable profiling (first round only)") + parser.add_argument( + "--enable-profiling", + type=int, + nargs="?", + const=3, + default=0, + metavar="LEVEL", + help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)", + ) parser.add_argument("--dump-tensor", action="store_true", help="Dump per-task tensor I/O at runtime") parser.add_argument("--build", action="store_true", help="Compile runtime from source") parser.add_argument( diff --git a/src/a2a3/platform/include/host/performance_collector.h b/src/a2a3/platform/include/host/performance_collector.h index cf6a52e2b..98644abbf 100644 --- a/src/a2a3/platform/include/host/performance_collector.h +++ b/src/a2a3/platform/include/host/performance_collector.h @@ -332,6 +332,12 @@ class PerformanceCollector { */ bool is_initialized() const { return perf_shared_mem_host_ != nullptr; } + /** + * Set profiling level before initialize(). 
+ * 0=off, 1=AICore-only, 2=task+fanout, 3=full (with phase buffers) + */ + void set_perf_level(int level) { perf_level_ = level; } + /** * Drain remaining buffers from the memory manager's ready queue * @@ -387,6 +393,9 @@ class PerformanceCollector { PerfRegisterCallback register_cb_{nullptr}; PerfFreeCallback free_cb_{nullptr}; + // Profiling level: 0=off, 1=AICore-only, 2=task+fanout, 3=full (with phase) + int perf_level_{0}; + // Memory manager ProfMemoryManager memory_manager_; diff --git a/src/a2a3/platform/include/host/runtime_profiling_mode.h b/src/a2a3/platform/include/host/runtime_profiling_mode.h new file mode 100644 index 000000000..a3999a300 --- /dev/null +++ b/src/a2a3/platform/include/host/runtime_profiling_mode.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * Shared helper: set perf_level and legacy enable_profiling on a Runtime struct. + * + * Used by both onboard and sim pto_runtime_c_api.cpp implementations. + * Some runtime structs still carry a bool enable_profiling member alongside + * the newer int perf_level. This template detects the legacy member at + * compile time and keeps both in sync. 
+ */ + +#pragma once + +#include + +template +struct HasEnableProfilingMember : std::false_type {}; + +template +struct HasEnableProfilingMember().enable_profiling)>> : std::true_type {}; + +template +static inline void set_runtime_profiling_mode(R *runtime, int enable_profiling) { + runtime->perf_level = enable_profiling; + if constexpr (HasEnableProfilingMember::value) { + runtime->enable_profiling = (enable_profiling > 0); + } +} diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index b3def91cc..0c9540ed9 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -461,7 +461,7 @@ int DeviceRunner::run( }); // Initialize performance profiling if enabled - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { rc = init_performance_profiling(runtime, num_aicore, device_id); if (rc != 0) { LOG_ERROR("init_performance_profiling failed: %d", rc); @@ -540,18 +540,18 @@ int DeviceRunner::run( { // Poll and collect performance data in a separate collector thread std::thread collector_thread; - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { collector_thread = create_thread([this, &runtime]() { poll_and_collect_performance_data(runtime.get_task_count()); }); } auto thread_guard = RAIIScopeGuard([&]() { - if (runtime.enable_profiling && collector_thread.joinable()) { + if (runtime.perf_level > 0 && collector_thread.joinable()) { collector_thread.join(); } }); auto collector_signal_guard = RAIIScopeGuard([this, &runtime]() { - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { perf_collector_.signal_execution_complete(); } }); @@ -588,7 +588,7 @@ int DeviceRunner::run( } // Stop memory management, drain remaining buffers, collect phase data, export - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { perf_collector_.stop_memory_manager(); perf_collector_.drain_remaining_buffers(); 
perf_collector_.scan_remaining_perf_buffers(); @@ -872,6 +872,7 @@ int DeviceRunner::init_performance_profiling(Runtime &runtime, int num_aicore, i return rtFree(dev_ptr); }; + perf_collector_.set_perf_level(runtime.perf_level); return perf_collector_.initialize(runtime, num_aicore, device_id, alloc_cb, register_cb, free_cb); } diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index d672355f6..3d2369025 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -26,6 +26,7 @@ #include "common/unified_log.h" #include "device_runner.h" #include "host/raii_scope_guard.h" +#include "host/runtime_profiling_mode.h" #include "runtime.h" extern "C" { @@ -162,9 +163,7 @@ int run_runtime( return rc; } - if (enable_profiling) { - r->enable_profiling = true; - } + set_runtime_profiling_mode(r, enable_profiling); std::vector aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size); std::vector aicore_vec(aicore_binary, aicore_binary + aicore_size); diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index 0979052b6..7a32eb2d4 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -314,7 +314,7 @@ int DeviceRunner::run( last_runtime_ = &runtime; // Initialize performance profiling if enabled - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { rc = init_performance_profiling(runtime, num_aicore, device_id); if (rc != 0) { LOG_ERROR("init_performance_profiling failed: %d", rc); @@ -420,7 +420,7 @@ int DeviceRunner::run( // Poll and collect performance data during execution (if enabled) std::thread collector_thread; - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { collector_thread = create_thread([this, &runtime]() { poll_and_collect_performance_data(runtime.get_task_count()); }); @@ -442,13 +442,13 @@ int 
DeviceRunner::run( } // Signal collector that device execution is complete - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { perf_collector_.signal_execution_complete(); } dump_collector_.signal_execution_complete(); // Wait for collector thread if it was launched - if (runtime.enable_profiling && collector_thread.joinable()) { + if (runtime.perf_level > 0 && collector_thread.joinable()) { collector_thread.join(); } if (dump_collector_thread.joinable()) { @@ -465,12 +465,12 @@ int DeviceRunner::run( } // Signal collector that device execution is complete - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { perf_collector_.signal_execution_complete(); } // Wait for collector thread if it was launched - if (runtime.enable_profiling && collector_thread.joinable()) { + if (runtime.perf_level > 0 && collector_thread.joinable()) { collector_thread.join(); } } @@ -484,7 +484,7 @@ int DeviceRunner::run( } // Stop memory management, drain remaining buffers, collect phase data, export - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { perf_collector_.stop_memory_manager(); perf_collector_.drain_remaining_buffers(); perf_collector_.scan_remaining_perf_buffers(); @@ -710,6 +710,7 @@ void DeviceRunner::remove_kernel_binary(int func_id) { // ============================================================================= int DeviceRunner::init_performance_profiling(Runtime &runtime, int num_aicore, int device_id) { + perf_collector_.set_perf_level(runtime.perf_level); // Define allocation callback (a2a3sim: use malloc) auto alloc_cb = [](size_t size) -> void * { return malloc(size); diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index 3e7dfd89e..96f927c73 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -28,6 +28,7 @@ #include "cpu_sim_context.h" #include "device_runner.h" #include "runtime.h" +#include 
"host/runtime_profiling_mode.h" extern "C" { @@ -154,9 +155,7 @@ int run_runtime( } // Phase 2: profiling - if (enable_profiling) { - r->enable_profiling = true; - } + set_runtime_profiling_mode(r, enable_profiling); // Phase 3: launch std::vector aicpu_vec; diff --git a/src/a2a3/platform/src/aicpu/performance_collector_aicpu.cpp b/src/a2a3/platform/src/aicpu/performance_collector_aicpu.cpp index 37e1f12c7..476d704a4 100644 --- a/src/a2a3/platform/src/aicpu/performance_collector_aicpu.cpp +++ b/src/a2a3/platform/src/aicpu/performance_collector_aicpu.cpp @@ -230,7 +230,7 @@ void perf_aicpu_switch_buffer(Runtime *runtime, int core_id, int thread_idx) { } void perf_aicpu_flush_buffers(Runtime *runtime, int thread_idx, const int *cur_thread_cores, int core_num) { - if (!runtime->enable_profiling) { + if (runtime->perf_level <= 0) { return; } diff --git a/src/a2a3/platform/src/host/performance_collector.cpp b/src/a2a3/platform/src/host/performance_collector.cpp index 79a3bd78b..7ee695437 100644 --- a/src/a2a3/platform/src/host/performance_collector.cpp +++ b/src/a2a3/platform/src/host/performance_collector.cpp @@ -77,7 +77,7 @@ void ProfMemoryManager::stop() { // Drain remaining done_queue and free buffers { - std::lock_guard lock(done_mutex_); + std::scoped_lock lock(done_mutex_); while (!done_queue_.empty()) { CopyDoneInfo info = done_queue_.front(); done_queue_.pop(); @@ -99,7 +99,7 @@ void ProfMemoryManager::stop() { } bool ProfMemoryManager::try_pop_ready(ReadyBufferInfo &info) { - std::lock_guard lock(ready_mutex_); + std::scoped_lock lock(ready_mutex_); if (ready_queue_.empty()) { return false; } @@ -121,7 +121,7 @@ bool ProfMemoryManager::wait_pop_ready(ReadyBufferInfo &info, std::chrono::milli } void ProfMemoryManager::notify_copy_done(const CopyDoneInfo &info) { - std::lock_guard lock(done_mutex_); + std::scoped_lock lock(done_mutex_); done_queue_.push(info); } @@ -210,7 +210,7 @@ void ProfMemoryManager::process_ready_entry( host_ptr = 
resolve_host_ptr(new_dev_ptr); } if (new_dev_ptr == nullptr) { - std::lock_guard lock(done_mutex_); + std::scoped_lock lock(done_mutex_); while (!done_queue_.empty()) { CopyDoneInfo dinfo = done_queue_.front(); done_queue_.pop(); @@ -258,7 +258,7 @@ void ProfMemoryManager::process_ready_entry( info.buffer_seq = seq; { - std::lock_guard lock(ready_mutex_); + std::scoped_lock lock(ready_mutex_); ready_queue_.push(info); } ready_cv_.notify_one(); @@ -289,7 +289,7 @@ void ProfMemoryManager::process_ready_entry( host_ptr = resolve_host_ptr(new_dev_ptr); } if (new_dev_ptr == nullptr) { - std::lock_guard lock(done_mutex_); + std::scoped_lock lock(done_mutex_); while (!done_queue_.empty()) { CopyDoneInfo dinfo = done_queue_.front(); done_queue_.pop(); @@ -335,7 +335,7 @@ void ProfMemoryManager::process_ready_entry( info.buffer_seq = seq; { - std::lock_guard lock(ready_mutex_); + std::scoped_lock lock(ready_mutex_); ready_queue_.push(info); } ready_cv_.notify_one(); @@ -348,7 +348,7 @@ void ProfMemoryManager::mgmt_loop() { while (running_.load()) { // 1. Recycle done queue: move completed buffers to recycled pools for reuse { - std::lock_guard lock(done_mutex_); + std::scoped_lock lock(done_mutex_); while (!done_queue_.empty()) { CopyDoneInfo info = done_queue_.front(); done_queue_.pop(); @@ -561,8 +561,9 @@ int PerformanceCollector::initialize( free_cb_ = free_cb; // Step 1: Calculate shared memory size (slot arrays only, no actual buffers) - int num_phase_threads = PLATFORM_MAX_AICPU_THREADS; - size_t total_size = calc_perf_data_size_with_phases(num_aicore, num_phase_threads); + int num_phase_threads = (perf_level_ >= 3) ? PLATFORM_MAX_AICPU_THREADS : 0; + size_t total_size = (num_phase_threads > 0) ? 
calc_perf_data_size_with_phases(num_aicore, num_phase_threads) : + calc_perf_data_size(num_aicore); LOG_DEBUG("Shared memory allocation plan:"); LOG_DEBUG(" Number of cores: %d", num_aicore); @@ -651,41 +652,43 @@ int PerformanceCollector::initialize( num_aicore * (PLATFORM_PROF_BUFFERS_PER_CORE - 1) ); - // Step 6: Initialize PhaseBufferStates — 1 buffer per thread in free_queue, rest to recycled pool - for (int t = 0; t < num_phase_threads; t++) { - PhaseBufferState *state = get_phase_buffer_state(perf_host_ptr, num_aicore, t); - memset(state, 0, sizeof(PhaseBufferState)); - - state->free_queue.head = 0; - state->free_queue.tail = 0; - state->current_buf_ptr = 0; - state->current_buf_seq = 0; - - for (int s = 0; s < PLATFORM_PROF_BUFFERS_PER_THREAD; s++) { - void *host_buf_ptr = nullptr; - void *dev_buf_ptr = alloc_single_buffer(sizeof(PhaseBuffer), &host_buf_ptr); - if (dev_buf_ptr == nullptr) { - LOG_ERROR("Failed to allocate PhaseBuffer for thread %d, buffer %d", t, s); - return -1; - } - PhaseBuffer *buf = reinterpret_cast(host_buf_ptr); - memset(buf, 0, sizeof(PhaseBuffer)); - buf->count = 0; + // Step 6: Initialize PhaseBufferStates (only when phase recording enabled) + if (num_phase_threads > 0) { + for (int t = 0; t < num_phase_threads; t++) { + PhaseBufferState *state = get_phase_buffer_state(perf_host_ptr, num_aicore, t); + memset(state, 0, sizeof(PhaseBufferState)); + + state->free_queue.head = 0; + state->free_queue.tail = 0; + state->current_buf_ptr = 0; + state->current_buf_seq = 0; + + for (int s = 0; s < PLATFORM_PROF_BUFFERS_PER_THREAD; s++) { + void *host_buf_ptr = nullptr; + void *dev_buf_ptr = alloc_single_buffer(sizeof(PhaseBuffer), &host_buf_ptr); + if (dev_buf_ptr == nullptr) { + LOG_ERROR("Failed to allocate PhaseBuffer for thread %d, buffer %d", t, s); + return -1; + } + PhaseBuffer *buf = reinterpret_cast(host_buf_ptr); + memset(buf, 0, sizeof(PhaseBuffer)); + buf->count = 0; - if (s == 0) { - state->free_queue.buffer_ptrs[0] = 
reinterpret_cast(dev_buf_ptr); - } else { - memory_manager_.recycled_phase_buffers_.push_back(dev_buf_ptr); + if (s == 0) { + state->free_queue.buffer_ptrs[0] = reinterpret_cast(dev_buf_ptr); + } else { + memory_manager_.recycled_phase_buffers_.push_back(dev_buf_ptr); + } } + wmb(); + state->free_queue.tail = 1; + wmb(); } - wmb(); - state->free_queue.tail = 1; - wmb(); + LOG_DEBUG( + "Initialized %d PhaseBufferStates: 1 buffer/thread, %d in recycled pool", num_phase_threads, + num_phase_threads * (PLATFORM_PROF_BUFFERS_PER_THREAD - 1) + ); } - LOG_DEBUG( - "Initialized %d PhaseBufferStates: 1 buffer/thread, %d in recycled pool", num_phase_threads, - num_phase_threads * (PLATFORM_PROF_BUFFERS_PER_THREAD - 1) - ); wmb(); @@ -706,8 +709,8 @@ void PerformanceCollector::start_memory_manager(const ThreadFactory &thread_fact } memory_manager_.start( - perf_shared_mem_host_, num_aicore_, PLATFORM_MAX_AICPU_THREADS, alloc_cb_, register_cb_, free_cb_, device_id_, - thread_factory + perf_shared_mem_host_, num_aicore_, (perf_level_ >= 3) ? PLATFORM_MAX_AICPU_THREADS : 0, alloc_cb_, + register_cb_, free_cb_, device_id_, thread_factory ); } @@ -1245,7 +1248,17 @@ int PerformanceCollector::export_swimlane_json(const std::string &output_path) { } // Step 7: Write JSON data - int version = has_phase_data_ ? 2 : 1; + int version; + if (perf_level_ <= 1) { + version = 0; + } else if (has_phase_data_) { + version = 2; + } else { + if (perf_level_ >= 3) { + LOG_WARN("perf_level=%d but no phase data collected; writing version=1", perf_level_); + } + version = 1; + } outfile << "{\n"; outfile << " \"version\": " << version << ",\n"; outfile << " \"tasks\": [\n"; @@ -1258,8 +1271,6 @@ int PerformanceCollector::export_swimlane_json(const std::string &output_path) { double start_us = cycles_to_us(record.start_time - base_time_cycles); double end_us = cycles_to_us(record.end_time - base_time_cycles); double duration_us = end_us - start_us; - double dispatch_us = (record.dispatch_time > 0) ? 
cycles_to_us(record.dispatch_time - base_time_cycles) : 0.0; - double finish_us = (record.finish_time > 0) ? cycles_to_us(record.finish_time - base_time_cycles) : 0.0; const char *core_type_str = (record.core_type == CoreType::AIC) ? "aic" : "aiv"; @@ -1271,20 +1282,27 @@ int PerformanceCollector::export_swimlane_json(const std::string &output_path) { outfile << " \"ring_id\": " << static_cast(record.task_id >> 32) << ",\n"; outfile << " \"start_time_us\": " << std::fixed << std::setprecision(3) << start_us << ",\n"; outfile << " \"end_time_us\": " << std::fixed << std::setprecision(3) << end_us << ",\n"; - outfile << " \"duration_us\": " << std::fixed << std::setprecision(3) << duration_us << ",\n"; - outfile << " \"dispatch_time_us\": " << std::fixed << std::setprecision(3) << dispatch_us << ",\n"; - outfile << " \"finish_time_us\": " << std::fixed << std::setprecision(3) << finish_us << ",\n"; - outfile << " \"fanout\": ["; - int safe_fanout_count = - (record.fanout_count >= 0 && record.fanout_count <= RUNTIME_MAX_FANOUT) ? record.fanout_count : 0; - for (int j = 0; j < safe_fanout_count; ++j) { - outfile << record.fanout[j]; - if (j < safe_fanout_count - 1) { - outfile << ", "; + if (perf_level_ >= 2) { + double dispatch_us = + (record.dispatch_time > 0) ? cycles_to_us(record.dispatch_time - base_time_cycles) : 0.0; + double finish_us = (record.finish_time > 0) ? cycles_to_us(record.finish_time - base_time_cycles) : 0.0; + outfile << " \"duration_us\": " << std::fixed << std::setprecision(3) << duration_us << ",\n"; + outfile << " \"dispatch_time_us\": " << std::fixed << std::setprecision(3) << dispatch_us << ",\n"; + outfile << " \"finish_time_us\": " << std::fixed << std::setprecision(3) << finish_us << ",\n"; + outfile << " \"fanout\": ["; + int safe_fanout_count = + (record.fanout_count >= 0 && record.fanout_count <= RUNTIME_MAX_FANOUT) ? 
record.fanout_count : 0; + for (int j = 0; j < safe_fanout_count; ++j) { + outfile << record.fanout[j]; + if (j < safe_fanout_count - 1) { + outfile << ", "; + } } + outfile << "],\n"; + outfile << " \"fanout_count\": " << record.fanout_count << "\n"; + } else { + outfile << " \"duration_us\": " << std::fixed << std::setprecision(3) << duration_us << "\n"; } - outfile << "],\n"; - outfile << " \"fanout_count\": " << record.fanout_count << "\n"; outfile << " }"; if (i < tagged_records.size() - 1) { outfile << ","; diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h b/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h index ca21bd33c..e6623dd8a 100644 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h @@ -168,6 +168,7 @@ class Runtime { // Profiling support bool enable_profiling; // Enable profiling flag + int perf_level = 0; // Derived from enable_profiling: 0=off, 2=task+fanout // Orchestrator-to-scheduler transition control // When true, orchestrator threads convert to scheduler threads after orchestration completes. 
diff --git a/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp index 3b4f541ba..c9ac022ee 100644 --- a/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp @@ -646,7 +646,7 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const int verification_warning_count = 0; const int MAX_VERIFICATION_WARNINGS = 10; - bool profiling_enabled = runtime.enable_profiling; + bool profiling_enabled = runtime.perf_level > 0; // Extract array pointers as local variables for better readability and performance int *cur_ready_queue_aic = cur_ready_queue_aic_[thread_idx]; diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp b/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp index 25af6e4c7..85cdf5927 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp @@ -46,6 +46,7 @@ Runtime::Runtime() { worker_count = 0; sche_cpu_num = 1; enable_profiling = false; + perf_level = 0; perf_data_base = 0; tensor_pair_count = 0; tensor_info_storage_ = nullptr; diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h index f528acf81..30d6cd244 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h @@ -203,7 +203,8 @@ class Runtime { int sche_cpu_num; // Number of AICPU threads for scheduling // Profiling support - bool enable_profiling; // Enable profiling flag + bool enable_profiling; // Legacy flag for host_build_graph runtime paths + int perf_level; // 0=off, >0=profiling on (extended mode encoding) uint64_t perf_data_base; // Performance data shared memory base address (device-side) // Task storage diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp 
b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp index 4113cba8b..4840a145e 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp @@ -88,7 +88,7 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in // Cache per-core dispatch payload pointer (set by AICPU before aicpu_ready) __gm__ PTO2DispatchPayload *payload = reinterpret_cast<__gm__ PTO2DispatchPayload *>(my_hank->task); - bool profiling_enabled = runtime->enable_profiling; + bool profiling_enabled = (runtime->perf_level > 0); // Phase 4: Main execution loop - poll register for tasks until exit signal // Register encoding: AICPU_IDLE_TASK_ID=idle, task_id=task, AICORE_EXIT_SIGNAL=exit diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index cc4fe8ab8..c2a39182b 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -479,7 +479,7 @@ struct AicpuExecutor { PTO2LocalReadyBuffer *local_bufs, CoreType ct #if PTO2_PROFILING , - bool profiling_enabled, uint32_t &phase_complete_count, uint64_t dispatch_ts + bool task_recording_enabled, int perf_level, uint32_t &phase_complete_count, uint64_t dispatch_ts #endif #if PTO2_SCHED_PROFILING , @@ -542,26 +542,31 @@ struct AicpuExecutor { } #if PTO2_PROFILING - if (profiling_enabled) { + if (task_recording_enabled) { #if PTO2_SCHED_PROFILING uint64_t t_perf_start = get_sys_cnt_aicpu(); #endif Handshake *h = &hank[core_id]; - uint64_t finish_ts = get_sys_cnt_aicpu(); PerfBuffer *perf_buf = reinterpret_cast(h->perf_records_addr); uint64_t fanout_arr[RUNTIME_MAX_FANOUT]; int32_t fanout_n = 0; - PTO2DepListEntry *cur = slot_state.fanout_head; - while (cur != nullptr && fanout_n < RUNTIME_MAX_FANOUT) { - fanout_arr[fanout_n++] = 
cur->slot_state->task->task_id.raw; - cur = cur->next; + uint64_t finish_ts = 0; + + if (perf_level >= 2) { + finish_ts = get_sys_cnt_aicpu(); + PTO2DepListEntry *cur = slot_state.fanout_head; + while (cur != nullptr && fanout_n < RUNTIME_MAX_FANOUT) { + fanout_arr[fanout_n++] = cur->slot_state->task->task_id.raw; + cur = cur->next; + } } int32_t perf_slot_idx = static_cast(subslot); if (perf_aicpu_complete_record( perf_buf, static_cast(expected_reg_task_id), slot_state.task->task_id.raw, - slot_state.task->kernel_id[perf_slot_idx], ct, dispatch_ts, finish_ts, fanout_arr, fanout_n + slot_state.task->kernel_id[perf_slot_idx], ct, dispatch_ts, finish_ts, + (perf_level >= 2) ? fanout_arr : nullptr, fanout_n ) != 0) { DEV_ERROR( "Core %d: perf_aicpu_complete_record failed for task 0x%" PRIx64, core_id, @@ -600,7 +605,7 @@ struct AicpuExecutor { PTO2LocalReadyBuffer *local_bufs #if PTO2_PROFILING , - bool profiling_enabled, uint32_t &phase_complete_count + bool task_recording_enabled, int perf_level, uint32_t &phase_complete_count #endif #if PTO2_SCHED_PROFILING , @@ -622,7 +627,7 @@ struct AicpuExecutor { int32_t reg_state = EXTRACT_TASK_STATE(reg_val); #if PTO2_SCHED_PROFILING - if (profiling_enabled) { + if (task_recording_enabled) { complete_probe_count++; } #endif @@ -632,7 +637,7 @@ struct AicpuExecutor { if (!t.matched) continue; #if PTO2_SCHED_PROFILING - if (profiling_enabled && (t.running_done || t.pending_done)) { + if (task_recording_enabled && (t.running_done || t.pending_done)) { complete_hit_count++; } #endif @@ -646,7 +651,7 @@ struct AicpuExecutor { completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs, CT #if PTO2_PROFILING , - profiling_enabled, phase_complete_count, core.pending_dispatch_timestamp + task_recording_enabled, perf_level, phase_complete_count, core.pending_dispatch_timestamp #endif #if PTO2_SCHED_PROFILING , @@ -662,7 +667,7 @@ struct AicpuExecutor { completed_this_turn, deferred_release_slot_states, 
deferred_release_count, local_bufs, CT #if PTO2_PROFILING , - profiling_enabled, phase_complete_count, core.running_dispatch_timestamp + task_recording_enabled, perf_level, phase_complete_count, core.running_dispatch_timestamp #endif #if PTO2_SCHED_PROFILING , @@ -809,7 +814,7 @@ struct AicpuExecutor { PTO2SubtaskSlot subslot, bool to_pending #if PTO2_PROFILING , - bool profiling_enabled + bool task_recording_enabled #endif ) { CoreTracker &tracker = core_trackers_[thread_idx]; @@ -849,7 +854,7 @@ struct AicpuExecutor { core_exec_state.pending_slot_state = &slot_state; core_exec_state.pending_reg_task_id = static_cast(reg_task_id); #if PTO2_PROFILING - if (profiling_enabled) { + if (task_recording_enabled && runtime->perf_level >= 2) { core_exec_state.pending_dispatch_timestamp = get_sys_cnt_aicpu(); } #endif @@ -858,7 +863,7 @@ struct AicpuExecutor { core_exec_state.running_slot_state = &slot_state; core_exec_state.running_reg_task_id = static_cast(reg_task_id); #if PTO2_PROFILING - if (profiling_enabled) { + if (task_recording_enabled && runtime->perf_level >= 2) { core_exec_state.running_dispatch_timestamp = get_sys_cnt_aicpu(); } #endif @@ -866,7 +871,7 @@ struct AicpuExecutor { tracker.change_core_state(core_offset); } #if PTO2_PROFILING - if (profiling_enabled) { + if (task_recording_enabled) { if (core_exec_state.dispatch_count >= PLATFORM_PROF_BUFFER_SIZE) { perf_aicpu_switch_buffer(runtime, core_id, thread_idx); core_exec_state.dispatch_count = 0; @@ -888,7 +893,7 @@ struct AicpuExecutor { Runtime *runtime, int32_t thread_idx, int32_t cluster_offset, PTO2TaskSlotState &slot_state #if PTO2_PROFILING , - bool profiling_enabled + bool task_recording_enabled #endif ) { CoreTracker &tracker = core_trackers_[thread_idx]; @@ -899,7 +904,7 @@ struct AicpuExecutor { false #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); } @@ -909,7 +914,7 @@ struct AicpuExecutor { false #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled 
#endif ); } @@ -919,7 +924,7 @@ struct AicpuExecutor { false #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); } @@ -959,7 +964,7 @@ struct AicpuExecutor { PTO2ResourceShape shape #if PTO2_PROFILING , - bool profiling_enabled, uint32_t &phase_dispatch_count + bool task_recording_enabled, bool phase_recording_enabled, uint32_t &phase_dispatch_count #endif ) { CoreTracker &tracker = core_trackers_[thread_idx]; @@ -981,7 +986,7 @@ struct AicpuExecutor { runtime, thread_idx, cluster_offset, slot_state #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); } else if (shape == PTO2ResourceShape::AIC) { @@ -990,7 +995,7 @@ struct AicpuExecutor { false #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); } else { // AIV @@ -1001,12 +1006,14 @@ struct AicpuExecutor { runtime, thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIV0, false #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); } #if PTO2_PROFILING - phase_dispatch_count += __builtin_popcount(pto2_core_mask(slot_state.active_mask)); + if (phase_recording_enabled) { + phase_dispatch_count += __builtin_popcount(pto2_core_mask(slot_state.active_mask)); + } #endif } @@ -1030,7 +1037,7 @@ struct AicpuExecutor { Runtime *runtime, int32_t block_num #if PTO2_PROFILING , - bool profiling_enabled, uint32_t &phase_dispatch_count + bool task_recording_enabled, bool phase_recording_enabled, uint32_t &phase_dispatch_count #endif ) { PTO2TaskSlotState *slot_state = drain_state_.pending_task; @@ -1047,7 +1054,7 @@ struct AicpuExecutor { runtime, t, valid.pop_first(), *slot_state, shape #if PTO2_PROFILING , - profiling_enabled, phase_dispatch_count + task_recording_enabled, phase_recording_enabled, phase_dispatch_count #endif ); slot_state->next_block_idx++; @@ -1082,7 +1089,7 @@ struct AicpuExecutor { Runtime *runtime, int32_t thread_idx #if PTO2_PROFILING , - bool profiling_enabled, uint32_t &phase_dispatch_count + bool 
task_recording_enabled, bool phase_recording_enabled, uint32_t &phase_dispatch_count #endif ) { // Spin until drain is fully initialized (sentinel -1 → block_num > 0). @@ -1139,7 +1146,7 @@ struct AicpuExecutor { runtime, block_num #if PTO2_PROFILING , - profiling_enabled, phase_dispatch_count + task_recording_enabled, phase_recording_enabled, phase_dispatch_count #endif ); } @@ -1532,9 +1539,10 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa #if PTO2_PROFILING // Assign perf buffers to cores early so profiling captures all tasks // (total_tasks written to header later when orchestrator completes) - if (runtime->enable_profiling) { + if (runtime->perf_level > 0) { perf_aicpu_init_profiling(runtime); - // Initialize phase profiling for scheduler threads + orchestrator threads + } + if (runtime->perf_level >= 3) { perf_aicpu_init_phase_profiling(runtime, sched_thread_num_); perf_aicpu_set_orch_thread_idx(sched_thread_num_); } @@ -1558,7 +1566,9 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa int32_t idle_iterations = 0; int32_t last_progress_count = 0; #if PTO2_PROFILING - bool profiling_enabled = runtime->enable_profiling; + int perf_level = runtime->perf_level; + bool task_recording_enabled = (perf_level > 0); + bool phase_recording_enabled = (perf_level >= 3); #endif // Scheduler profiling counters @@ -1681,7 +1691,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa deferred_release_slot_states, deferred_release_count, local_bufs #if PTO2_PROFILING , - profiling_enabled, phase_complete_count + task_recording_enabled, perf_level, phase_complete_count #endif #if PTO2_SCHED_PROFILING , @@ -1699,7 +1709,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa deferred_release_slot_states, deferred_release_count, local_bufs #if PTO2_PROFILING , - profiling_enabled, phase_complete_count + task_recording_enabled, perf_level, 
phase_complete_count #endif #if PTO2_SCHED_PROFILING , @@ -1731,7 +1741,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa CYCLE_COUNT_LAP(sched_idle_cycle); } else { CYCLE_COUNT_LAP(sched_complete_cycle); - if (profiling_enabled && phase_complete_count > 0) { + if (phase_recording_enabled && phase_complete_count > 0) { perf_aicpu_record_phase( thread_idx, AicpuPhaseId::SCHED_COMPLETE, _t0_phase, _t1, sched_loop_count, phase_complete_count ); @@ -1750,7 +1760,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa runtime, thread_idx #if PTO2_PROFILING , - profiling_enabled, phase_dispatch_count + task_recording_enabled, phase_recording_enabled, phase_dispatch_count #endif ); continue; @@ -1840,7 +1850,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa runtime, thread_idx, current_valid_cluster_offset, *slot_state, shape #if PTO2_PROFILING , - profiling_enabled, phase_dispatch_count + task_recording_enabled, phase_recording_enabled, phase_dispatch_count #endif ); slot_state->next_block_idx++; @@ -1910,7 +1920,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa PTO2SubtaskSlot::AIC, true #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); slot_state->next_block_idx++; @@ -1948,7 +1958,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa CYCLE_COUNT_LAP(sched_idle_cycle); } else { CYCLE_COUNT_LAP(sched_dispatch_cycle); - if (profiling_enabled && phase_dispatch_count > 0) { + if (phase_recording_enabled && phase_dispatch_count > 0) { perf_aicpu_record_phase( thread_idx, AicpuPhaseId::SCHED_DISPATCH, _t0_phase, _t1, sched_loop_count, phase_dispatch_count ); @@ -2108,7 +2118,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa } #if PTO2_PROFILING CYCLE_COUNT_LAP(sched_idle_cycle); - if (profiling_enabled) { + if (phase_recording_enabled) { 
perf_aicpu_record_phase(thread_idx, AicpuPhaseId::SCHED_IDLE_WAIT, _t0_phase, _t1, sched_loop_count, 0); _t0_phase = _t1; } @@ -2275,9 +2285,10 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa #endif #if PTO2_PROFILING - // Flush performance buffers for cores managed by this thread - if (profiling_enabled) { + if (task_recording_enabled) { perf_aicpu_flush_buffers(runtime, thread_idx, core_assignments_[thread_idx], core_num); + } + if (phase_recording_enabled) { perf_aicpu_flush_phase_buffers(thread_idx); } #endif @@ -2479,7 +2490,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { } #if PTO2_PROFILING - rt->orchestrator.enable_profiling = runtime->enable_profiling; + rt->orchestrator.perf_level = runtime->perf_level; #endif // Total core counts = aic_count_ / aiv_count_ (set once at runtime init). @@ -2503,7 +2514,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { } #if PTO2_PROFILING - if (runtime->enable_profiling) { + if (runtime->perf_level >= 3) { perf_aicpu_set_orch_thread_idx(thread_idx); } #endif @@ -2588,7 +2599,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { #if PTO2_PROFILING // Write orchestrator summary to shared memory for host-side export (only if profiling enabled) - if (runtime->enable_profiling) { + if (runtime->perf_level >= 3) { AicpuOrchSummary orch_summary = {}; orch_summary.start_time = orch_cycle_start; orch_summary.end_time = orch_cycle_end; @@ -2608,7 +2619,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { #if PTO2_PROFILING // Write core-to-thread mapping (one-time, after orchestration) - if (runtime->enable_profiling) { + if (runtime->perf_level >= 3) { perf_aicpu_write_core_assignments( core_assignments_, core_count_per_thread_, sched_thread_num_, cores_total_num_ ); @@ -2632,7 +2643,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { pto2_submitted_tasks = pto2_task_count; #endif total_tasks_ = pto2_task_count; - if (runtime->enable_profiling && pto2_task_count > 0) { + if (runtime->perf_level 
> 0 && pto2_task_count > 0) { perf_aicpu_update_total_tasks(runtime, static_cast(pto2_task_count)); } int32_t inline_completed = static_cast<int32_t>(rt->orchestrator.inline_completed_tasks); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index 85420e245..558b575f9 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -95,8 +95,8 @@ __attribute__((weak, visibility("hidden"))) void perf_aicpu_record_orch_phase(AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {} // submit_idx needed for swimlane task_id tagging (no cycle accumulation at this level) static uint32_t g_orch_submit_idx = 0; -#define CYCLE_COUNT_START() \ - bool _prof_active = orch->enable_profiling; \ +#define CYCLE_COUNT_START() \ + bool _prof_active = (orch->perf_level >= 3); \ uint64_t _t0 = _prof_active ? get_sys_cnt_aicpu() : 0, _t1 = 0 #define CYCLE_COUNT_LAP(acc) \ do { \ diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index 0ad5e6873..3089a0442 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -75,8 +75,7 @@ struct PTO2OrchestratorState { int32_t total_cluster_count{0}; // AIC cores = MIX clusters int32_t total_aiv_count{0}; // AIV cores (= 2 × clusters on standard hardware) #if PTO2_PROFILING - // Runtime profiling switch copied from Runtime::enable_profiling.
- bool enable_profiling; + int perf_level; #endif // === GM HEAP (for output buffers) === diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 2d0e0b4b4..2b0fff2e2 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -171,7 +171,7 @@ class Runtime { uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID]; // Profiling support - bool enable_profiling; // Enable profiling flag + int perf_level = 0; // 0=off, 1=AICore-only, 2=task+fanout, 3=full // Orchestrator-to-scheduler transition control // When true, orchestrator threads convert to scheduler threads after orchestration completes. diff --git a/src/common/distributed/dist_chip_process.cpp b/src/common/distributed/dist_chip_process.cpp index 3bbe6e003..e79a4c9cb 100644 --- a/src/common/distributed/dist_chip_process.cpp +++ b/src/common/distributed/dist_chip_process.cpp @@ -56,7 +56,7 @@ void DistChipProcess::run(uint64_t callable, TaskArgsView args, const ChipCallCo // Write config fields. int32_t block_dim = config.block_dim; int32_t aicpu_tn = config.aicpu_thread_num; - int32_t profiling = config.enable_profiling ? 1 : 0; + int32_t profiling = config.perf_level; std::memcpy(base() + OFF_BLOCK_DIM, &block_dim, sizeof(int32_t)); std::memcpy(base() + OFF_AICPU_THREAD_NUM, &aicpu_tn, sizeof(int32_t)); std::memcpy(base() + OFF_ENABLE_PROFILING, &profiling, sizeof(int32_t)); diff --git a/src/common/task_interface/chip_call_config.h b/src/common/task_interface/chip_call_config.h index 1f1eb7721..1b8e30b81 100644 --- a/src/common/task_interface/chip_call_config.h +++ b/src/common/task_interface/chip_call_config.h @@ -11,7 +11,7 @@ /** * ChipCallConfig — per-NEXT_LEVEL-task config (block_dim, aicpu_thread_num, - * enable_profiling). Lives here (rather than chip_worker.h) so distributed + * enable_profiling/perf_level). 
Lives here (rather than chip_worker.h) so distributed * task slot state can store it directly without pulling in the full * ChipWorker header (which depends on dist_types.h). */ @@ -21,6 +21,6 @@ struct ChipCallConfig { int block_dim = 24; int aicpu_thread_num = 3; - bool enable_profiling = false; + int perf_level = 0; // 0=off, 1=AICore-only, 2=task+fanout, 3=full bool enable_dump_tensor = false; }; diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp index e919fcc27..3a3a7fa32 100644 --- a/src/common/worker/chip_worker.cpp +++ b/src/common/worker/chip_worker.cpp @@ -205,7 +205,7 @@ void ChipWorker::run(const void *callable, const void *args, const ChipCallConfi int rc = run_runtime_fn_( device_ctx_, rt, callable, args, config.block_dim, config.aicpu_thread_num, device_id_, aicpu_binary_.data(), - aicpu_binary_.size(), aicore_binary_.data(), aicore_binary_.size(), config.enable_profiling ? 1 : 0, + aicpu_binary_.size(), aicore_binary_.data(), aicore_binary_.size(), config.perf_level, config.enable_dump_tensor ? 
1 : 0 ); if (rc != 0) { diff --git a/tests/ut/py/test_chip_worker.py b/tests/ut/py/test_chip_worker.py index 59b85f985..8a3947e3b 100644 --- a/tests/ut/py/test_chip_worker.py +++ b/tests/ut/py/test_chip_worker.py @@ -31,7 +31,7 @@ def test_defaults(self): config = ChipCallConfig() assert config.block_dim == 24 assert config.aicpu_thread_num == 3 - assert config.enable_profiling is False + assert config.enable_profiling == 0 def test_setters(self): config = ChipCallConfig() @@ -40,13 +40,13 @@ def test_setters(self): config.enable_profiling = True assert config.block_dim == 32 assert config.aicpu_thread_num == 4 - assert config.enable_profiling is True + assert config.enable_profiling > 0 def test_repr(self): config = ChipCallConfig() r = repr(config) assert "block_dim=24" in r - assert "enable_profiling=False" in r + assert "enable_profiling=0" in r # ============================================================================ diff --git a/tools/swimlane_converter.py b/tools/swimlane_converter.py index 9acd7ab9a..1aaa9f69f 100644 --- a/tools/swimlane_converter.py +++ b/tools/swimlane_converter.py @@ -120,8 +120,8 @@ def read_perf_data(filepath): raise ValueError(f"Missing required field: {field}") # Validate version - if data["version"] not in [1, 2]: - raise ValueError(f"Unsupported version: {data['version']} (expected 1 or 2)") + if data["version"] not in [0, 1, 2]: + raise ValueError(f"Unsupported version: {data['version']} (expected 0/1/2)") return data @@ -482,7 +482,8 @@ def generate_chrome_trace_json( # noqa: PLR0912, PLR0915 dur = task["duration_us"] # Build fanout hint string (packed ids → rXtY / tY for readability) - fanout_str = "[" + ", ".join(format_task_display(x) for x in task["fanout"]) + "]" + fanout_list = task.get("fanout", []) + fanout_str = "[" + ", ".join(format_task_display(x) for x in fanout_list) + "]" # Get function name if available func_id = task["func_id"] @@ -632,7 +633,7 @@ def generate_chrome_trace_json( # noqa: PLR0912, PLR0915 
src_tid = core_to_tid[task["core_id"]] src_ts_end = task["end_time_us"] - for succ_task_id in task["fanout"]: + for succ_task_id in task.get("fanout", []): if succ_task_id not in task_map: if verbose: print( @@ -864,7 +865,7 @@ def generate_chrome_trace_json( # noqa: PLR0912, PLR0915 src_tid = task_to_aicpu_tid.get(task["task_id"], core_to_tid[task["core_id"]]) src_aicpu_eid = task_to_aicpu_event_id.get(task["task_id"]) - for succ_task_id in task["fanout"]: + for succ_task_id in task.get("fanout", []): if succ_task_id not in task_map: continue