Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,12 @@ def pytest_addoption(parser):
"--skip-golden", action="store_true", default=False, help="Skip golden comparison (benchmark mode)"
)
parser.addoption(
"--enable-profiling", action="store_true", default=False, help="Enable profiling (first round only)"
"--enable-profiling",
type=int,
nargs="?",
const=3,
default=0,
help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)",
)
parser.addoption("--dump-tensor", action="store_true", default=False, help="Dump per-task tensor I/O at runtime")
parser.addoption("--build", action="store_true", default=False, help="Compile runtime from source")
Expand Down
8 changes: 6 additions & 2 deletions examples/scripts/run_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,12 @@ def compute_golden(tensors: dict, params: dict) -> None:

parser.add_argument(
"--enable-profiling",
action="store_true",
help="Enable profiling and generate swimlane.json",
type=int,
nargs="?",
const=3,
default=0,
metavar="LEVEL",
help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)",
)

parser.add_argument(
Expand Down
28 changes: 20 additions & 8 deletions python/bindings/task_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -540,12 +540,24 @@ NB_MODULE(_task_interface, m) {
.def(nb::init<>())
.def_rw("block_dim", &ChipCallConfig::block_dim)
.def_rw("aicpu_thread_num", &ChipCallConfig::aicpu_thread_num)
.def_rw("enable_profiling", &ChipCallConfig::enable_profiling)
.def_prop_rw(
"enable_profiling",
[](const ChipCallConfig &self) {
return self.perf_level;
},
[](ChipCallConfig &self, nb::object v) {
if (nb::isinstance<nb::bool_>(v)) {
self.perf_level = nb::cast<bool>(v) ? 3 : 0;
} else {
self.perf_level = nb::cast<int>(v);
}
}
)
.def_rw("enable_dump_tensor", &ChipCallConfig::enable_dump_tensor)
.def("__repr__", [](const ChipCallConfig &self) -> std::string {
std::ostringstream os;
os << "ChipCallConfig(block_dim=" << self.block_dim << ", aicpu_thread_num=" << self.aicpu_thread_num
<< ", enable_profiling=" << (self.enable_profiling ? "True" : "False")
<< ", enable_profiling=" << self.perf_level
<< ", enable_dump_tensor=" << (self.enable_dump_tensor ? "True" : "False") << ")";
return os.str();
});
Expand All @@ -571,29 +583,29 @@ NB_MODULE(_task_interface, m) {
.def(
"run_raw",
[](ChipWorker &self, uint64_t callable, uint64_t args, int block_dim, int aicpu_thread_num,
bool enable_profiling) {
int perf_level) {
ChipCallConfig config;
config.block_dim = block_dim;
config.aicpu_thread_num = aicpu_thread_num;
config.enable_profiling = enable_profiling;
config.perf_level = perf_level;
self.run(reinterpret_cast<const void *>(callable), reinterpret_cast<const void *>(args), config);
},
nb::arg("callable"), nb::arg("args"), nb::arg("block_dim") = 1, nb::arg("aicpu_thread_num") = 3,
nb::arg("enable_profiling") = false, "Run with a raw ChipStorageTaskArgs POD pointer."
nb::arg("perf_level") = 0, "Run with a raw ChipStorageTaskArgs POD pointer."
)
.def(
"run_from_blob",
[](ChipWorker &self, uint64_t callable, uint64_t blob_ptr, int block_dim, int aicpu_thread_num,
bool enable_profiling) {
int perf_level) {
ChipCallConfig config;
config.block_dim = block_dim;
config.aicpu_thread_num = aicpu_thread_num;
config.enable_profiling = enable_profiling;
config.perf_level = perf_level;
TaskArgsView view = read_blob(reinterpret_cast<const uint8_t *>(blob_ptr));
self.run(callable, view, config);
},
nb::arg("callable"), nb::arg("blob_ptr"), nb::arg("block_dim") = 1, nb::arg("aicpu_thread_num") = 3,
nb::arg("enable_profiling") = false,
nb::arg("perf_level") = 0,
"Decode a length-prefixed TaskArgs blob ([T][S][tensors][scalars]) at "
"blob_ptr and dispatch to the runtime. Used from forked chip processes "
"reading the WorkerThread mailbox."
Expand Down
2 changes: 1 addition & 1 deletion python/simpler/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ def _chip_process_loop(

error = 0
try:
cw.run_from_blob(callable_ptr, args_ptr, block_dim, aicpu_tn, bool(profiling))
cw.run_from_blob(callable_ptr, args_ptr, block_dim, aicpu_tn, profiling)
except Exception: # noqa: BLE001
error = 1
struct.pack_into("i", buf, _CHIP_OFF_ERROR, error)
Expand Down
16 changes: 11 additions & 5 deletions simpler_setup/code_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,12 @@ def _load_module_from_path(module_path: Path, module_name: str):
return module


def _normalize_perf_level(v) -> int:
if isinstance(v, bool):
return 3 if v else 0
return int(v)


def _kernel_config_runtime_env(kernel_config_module, kernels_dir: Path) -> dict[str, str]:
"""
Optional per-example environment variables for runtime compilation.
Expand Down Expand Up @@ -192,7 +198,7 @@ def __init__( # noqa: PLR0913
golden_path: str,
device_id: Optional[int] = None,
platform: str = "a2a3",
enable_profiling: bool = False,
enable_profiling: int = 0,
enable_dump_tensor: bool = False,
run_all_cases: bool = False,
case_name: Optional[str] = None,
Expand All @@ -212,7 +218,7 @@ def __init__( # noqa: PLR0913
self.kernels_dir = Path(kernels_dir).resolve()
self.golden_path = Path(golden_path).resolve()
self.platform = platform
self.enable_profiling = enable_profiling
self._perf_level = _normalize_perf_level(enable_profiling)
self.enable_dump_tensor = enable_dump_tensor
self.skip_golden = skip_golden
self.project_root = PROJECT_ROOT
Expand Down Expand Up @@ -607,9 +613,9 @@ def _compile_one_kernel(kernel):
config = ChipCallConfig()
config.block_dim = self.block_dim
config.aicpu_thread_num = self.aicpu_thread_num
if self.enable_profiling and round_idx == 0:
config.enable_profiling = True
logger.info("Profiling enabled")
if self._perf_level > 0 and round_idx == 0:
config.enable_profiling = self._perf_level
logger.info(f"Swimlane profiling enabled (mode={self._perf_level})")
if self.enable_dump_tensor:
config.enable_dump_tensor = True
logger.info("Dump tensor enabled")
Expand Down
16 changes: 12 additions & 4 deletions simpler_setup/scene_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -700,7 +700,7 @@ def build_callable(self, platform):
return self._compile_l3_callables(platform)
raise ValueError(f"Unsupported level: {self._st_level}")

def _build_config(self, config_dict, enable_profiling=False, enable_dump_tensor=False):
def _build_config(self, config_dict, enable_profiling=0, enable_dump_tensor=False):
from simpler.task_interface import ChipCallConfig # noqa: PLC0415

config = ChipCallConfig()
Expand Down Expand Up @@ -791,7 +791,7 @@ def _run_and_validate_l2(

config = self._build_config(
config_dict,
enable_profiling=(enable_profiling and round_idx == 0),
enable_profiling=(enable_profiling if round_idx == 0 else 0),
enable_dump_tensor=enable_dump_tensor,
)

Expand Down Expand Up @@ -847,7 +847,7 @@ def _run_and_validate_l3(

config = self._build_config(
config_dict,
enable_profiling=(enable_profiling and round_idx == 0),
enable_profiling=(enable_profiling if round_idx == 0 else 0),
enable_dump_tensor=enable_dump_tensor,
)

Expand Down Expand Up @@ -948,7 +948,15 @@ def run_module(module_name):
)
parser.add_argument("-n", "--rounds", type=int, default=1, help="Run each case N times (default: 1)")
parser.add_argument("--skip-golden", action="store_true", help="Skip golden comparison (benchmark mode)")
parser.add_argument("--enable-profiling", action="store_true", help="Enable profiling (first round only)")
parser.add_argument(
"--enable-profiling",
type=int,
nargs="?",
const=3,
default=0,
metavar="LEVEL",
help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)",
)
parser.add_argument("--dump-tensor", action="store_true", help="Dump per-task tensor I/O at runtime")
parser.add_argument("--build", action="store_true", help="Compile runtime from source")
parser.add_argument(
Expand Down
9 changes: 9 additions & 0 deletions src/a2a3/platform/include/host/performance_collector.h
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,12 @@ class PerformanceCollector {
*/
bool is_initialized() const { return perf_shared_mem_host_ != nullptr; }

/**
* Set profiling level before initialize().
* 0=off, 1=AICore-only, 2=task+fanout, 3=full (with phase buffers)
*/
void set_perf_level(int level) { perf_level_ = level; }

/**
* Drain remaining buffers from the memory manager's ready queue
*
Expand Down Expand Up @@ -387,6 +393,9 @@ class PerformanceCollector {
PerfRegisterCallback register_cb_{nullptr};
PerfFreeCallback free_cb_{nullptr};

// Profiling level: 0=off, 1=AICore-only, 2=task+fanout, 3=full (with phase)
int perf_level_{0};

// Memory manager
ProfMemoryManager memory_manager_;

Expand Down
37 changes: 37 additions & 0 deletions src/a2a3/platform/include/host/runtime_profiling_mode.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Copyright (c) PyPTO Contributors.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
* -----------------------------------------------------------------------------------------------------------
*/

/**
* Shared helper: set perf_level and legacy enable_profiling on a Runtime struct.
*
* Used by both onboard and sim pto_runtime_c_api.cpp implementations.
* Some runtime structs still carry a bool enable_profiling member alongside
* the newer int perf_level. This template detects the legacy member at
* compile time and keeps both in sync.
*/

#pragma once

#include <type_traits>

// Primary template: by default, assume T has no legacy `enable_profiling` member.
template <typename T, typename = void>
struct HasEnableProfilingMember : std::false_type {};

// SFINAE specialization: selected (via std::void_t) only when the expression
// `declval<T&>().enable_profiling` is well-formed, i.e. the member exists.
template <typename T>
struct HasEnableProfilingMember<T, std::void_t<decltype(std::declval<T &>().enable_profiling)>> : std::true_type {};

// Write the profiling level into the runtime struct; when the struct still
// carries the legacy bool `enable_profiling`, mirror the level into it
// (any level > 0 means "profiling on"). The parameter keeps its historical
// name for source compatibility, but it is a level (0..3), not a bool.
template <typename R>
static inline void set_runtime_profiling_mode(R *runtime, int enable_profiling) {
runtime->perf_level = enable_profiling;
if constexpr (HasEnableProfilingMember<R>::value) {
runtime->enable_profiling = (enable_profiling > 0);
}
}
11 changes: 6 additions & 5 deletions src/a2a3/platform/onboard/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,7 @@ int DeviceRunner::run(
});

// Initialize performance profiling if enabled
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
rc = init_performance_profiling(runtime, num_aicore, device_id);
if (rc != 0) {
LOG_ERROR("init_performance_profiling failed: %d", rc);
Expand Down Expand Up @@ -540,18 +540,18 @@ int DeviceRunner::run(
{
// Poll and collect performance data in a separate collector thread
std::thread collector_thread;
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
collector_thread = create_thread([this, &runtime]() {
poll_and_collect_performance_data(runtime.get_task_count());
});
}
auto thread_guard = RAIIScopeGuard([&]() {
if (runtime.enable_profiling && collector_thread.joinable()) {
if (runtime.perf_level > 0 && collector_thread.joinable()) {
collector_thread.join();
}
});
auto collector_signal_guard = RAIIScopeGuard([this, &runtime]() {
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
perf_collector_.signal_execution_complete();
}
});
Expand Down Expand Up @@ -588,7 +588,7 @@ int DeviceRunner::run(
}

// Stop memory management, drain remaining buffers, collect phase data, export
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
perf_collector_.stop_memory_manager();
perf_collector_.drain_remaining_buffers();
perf_collector_.scan_remaining_perf_buffers();
Expand Down Expand Up @@ -872,6 +872,7 @@ int DeviceRunner::init_performance_profiling(Runtime &runtime, int num_aicore, i
return rtFree(dev_ptr);
};

perf_collector_.set_perf_level(runtime.perf_level);
return perf_collector_.initialize(runtime, num_aicore, device_id, alloc_cb, register_cb, free_cb);
}

Expand Down
5 changes: 2 additions & 3 deletions src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "common/unified_log.h"
#include "device_runner.h"
#include "host/raii_scope_guard.h"
#include "host/runtime_profiling_mode.h"
#include "runtime.h"

extern "C" {
Expand Down Expand Up @@ -162,9 +163,7 @@ int run_runtime(
return rc;
}

if (enable_profiling) {
r->enable_profiling = true;
}
set_runtime_profiling_mode(r, enable_profiling);

std::vector<uint8_t> aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size);
std::vector<uint8_t> aicore_vec(aicore_binary, aicore_binary + aicore_size);
Expand Down
15 changes: 8 additions & 7 deletions src/a2a3/platform/sim/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ int DeviceRunner::run(
last_runtime_ = &runtime;

// Initialize performance profiling if enabled
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
rc = init_performance_profiling(runtime, num_aicore, device_id);
if (rc != 0) {
LOG_ERROR("init_performance_profiling failed: %d", rc);
Expand Down Expand Up @@ -420,7 +420,7 @@ int DeviceRunner::run(

// Poll and collect performance data during execution (if enabled)
std::thread collector_thread;
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
collector_thread = create_thread([this, &runtime]() {
poll_and_collect_performance_data(runtime.get_task_count());
});
Expand All @@ -442,13 +442,13 @@ int DeviceRunner::run(
}

// Signal collector that device execution is complete
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
perf_collector_.signal_execution_complete();
}
dump_collector_.signal_execution_complete();

// Wait for collector thread if it was launched
if (runtime.enable_profiling && collector_thread.joinable()) {
if (runtime.perf_level > 0 && collector_thread.joinable()) {
collector_thread.join();
}
if (dump_collector_thread.joinable()) {
Expand All @@ -465,12 +465,12 @@ int DeviceRunner::run(
}

// Signal collector that device execution is complete
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
perf_collector_.signal_execution_complete();
}

// Wait for collector thread if it was launched
if (runtime.enable_profiling && collector_thread.joinable()) {
if (runtime.perf_level > 0 && collector_thread.joinable()) {
collector_thread.join();
}
}
Expand All @@ -484,7 +484,7 @@ int DeviceRunner::run(
}

// Stop memory management, drain remaining buffers, collect phase data, export
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
perf_collector_.stop_memory_manager();
perf_collector_.drain_remaining_buffers();
perf_collector_.scan_remaining_perf_buffers();
Expand Down Expand Up @@ -710,6 +710,7 @@ void DeviceRunner::remove_kernel_binary(int func_id) {
// =============================================================================

int DeviceRunner::init_performance_profiling(Runtime &runtime, int num_aicore, int device_id) {
perf_collector_.set_perf_level(runtime.perf_level);
// Define allocation callback (a2a3sim: use malloc)
auto alloc_cb = [](size_t size) -> void * {
return malloc(size);
Expand Down
Loading
Loading