diff --git a/conftest.py b/conftest.py index 97e43d534..ca744493c 100644 --- a/conftest.py +++ b/conftest.py @@ -84,7 +84,12 @@ def pytest_addoption(parser): "--skip-golden", action="store_true", default=False, help="Skip golden comparison (benchmark mode)" ) parser.addoption( - "--enable-profiling", action="store_true", default=False, help="Enable profiling (first round only)" + "--enable-profiling", + type=int, + nargs="?", + const=3, + default=0, + help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)", ) parser.addoption("--dump-tensor", action="store_true", default=False, help="Dump per-task tensor I/O at runtime") parser.addoption("--build", action="store_true", default=False, help="Compile runtime from source") diff --git a/examples/scripts/run_example.py b/examples/scripts/run_example.py index 839b5600f..db58f9dcd 100644 --- a/examples/scripts/run_example.py +++ b/examples/scripts/run_example.py @@ -139,8 +139,12 @@ def compute_golden(tensors: dict, params: dict) -> None: parser.add_argument( "--enable-profiling", - action="store_true", - help="Enable profiling and generate swimlane.json", + type=int, + nargs="?", + const=3, + default=0, + metavar="LEVEL", + help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)", ) parser.add_argument( diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp index d949735c3..148b66824 100644 --- a/python/bindings/task_interface.cpp +++ b/python/bindings/task_interface.cpp @@ -540,12 +540,24 @@ NB_MODULE(_task_interface, m) { .def(nb::init<>()) .def_rw("block_dim", &ChipCallConfig::block_dim) .def_rw("aicpu_thread_num", &ChipCallConfig::aicpu_thread_num) - .def_rw("enable_profiling", &ChipCallConfig::enable_profiling) + .def_prop_rw( + "enable_profiling", + [](const ChipCallConfig &self) { + return self.perf_level; + }, + [](ChipCallConfig &self, nb::object v) { + if (nb::isinstance(v)) { + self.perf_level = 
nb::cast(v) ? 3 : 0; + } else { + self.perf_level = nb::cast(v); + } + } + ) .def_rw("enable_dump_tensor", &ChipCallConfig::enable_dump_tensor) .def("__repr__", [](const ChipCallConfig &self) -> std::string { std::ostringstream os; os << "ChipCallConfig(block_dim=" << self.block_dim << ", aicpu_thread_num=" << self.aicpu_thread_num - << ", enable_profiling=" << (self.enable_profiling ? "True" : "False") + << ", enable_profiling=" << self.perf_level << ", enable_dump_tensor=" << (self.enable_dump_tensor ? "True" : "False") << ")"; return os.str(); }); @@ -571,29 +583,29 @@ NB_MODULE(_task_interface, m) { .def( "run_raw", [](ChipWorker &self, uint64_t callable, uint64_t args, int block_dim, int aicpu_thread_num, - bool enable_profiling) { + int perf_level) { ChipCallConfig config; config.block_dim = block_dim; config.aicpu_thread_num = aicpu_thread_num; - config.enable_profiling = enable_profiling; + config.perf_level = perf_level; self.run(reinterpret_cast(callable), reinterpret_cast(args), config); }, nb::arg("callable"), nb::arg("args"), nb::arg("block_dim") = 1, nb::arg("aicpu_thread_num") = 3, - nb::arg("enable_profiling") = false, "Run with a raw ChipStorageTaskArgs POD pointer." + nb::arg("perf_level") = 0, "Run with a raw ChipStorageTaskArgs POD pointer." 
) .def( "run_from_blob", [](ChipWorker &self, uint64_t callable, uint64_t blob_ptr, int block_dim, int aicpu_thread_num, - bool enable_profiling) { + int perf_level) { ChipCallConfig config; config.block_dim = block_dim; config.aicpu_thread_num = aicpu_thread_num; - config.enable_profiling = enable_profiling; + config.perf_level = perf_level; TaskArgsView view = read_blob(reinterpret_cast(blob_ptr)); self.run(callable, view, config); }, nb::arg("callable"), nb::arg("blob_ptr"), nb::arg("block_dim") = 1, nb::arg("aicpu_thread_num") = 3, - nb::arg("enable_profiling") = false, + nb::arg("perf_level") = 0, "Decode a length-prefixed TaskArgs blob ([T][S][tensors][scalars]) at " "blob_ptr and dispatch to the runtime. Used from forked chip processes " "reading the WorkerThread mailbox." diff --git a/python/simpler/worker.py b/python/simpler/worker.py index d40d4c235..da2e0b62a 100644 --- a/python/simpler/worker.py +++ b/python/simpler/worker.py @@ -161,7 +161,7 @@ def _chip_process_loop( error = 0 try: - cw.run_from_blob(callable_ptr, args_ptr, block_dim, aicpu_tn, bool(profiling)) + cw.run_from_blob(callable_ptr, args_ptr, block_dim, aicpu_tn, profiling) except Exception: # noqa: BLE001 error = 1 struct.pack_into("i", buf, _CHIP_OFF_ERROR, error) diff --git a/simpler_setup/code_runner.py b/simpler_setup/code_runner.py index a70f6096c..df761d117 100644 --- a/simpler_setup/code_runner.py +++ b/simpler_setup/code_runner.py @@ -123,6 +123,12 @@ def _load_module_from_path(module_path: Path, module_name: str): return module +def _normalize_perf_level(v) -> int: + if isinstance(v, bool): + return 3 if v else 0 + return int(v) + + def _kernel_config_runtime_env(kernel_config_module, kernels_dir: Path) -> dict[str, str]: """ Optional per-example environment variables for runtime compilation. 
@@ -192,7 +198,7 @@ def __init__( # noqa: PLR0913 golden_path: str, device_id: Optional[int] = None, platform: str = "a2a3", - enable_profiling: bool = False, + enable_profiling: int = 0, enable_dump_tensor: bool = False, run_all_cases: bool = False, case_name: Optional[str] = None, @@ -212,7 +218,7 @@ def __init__( # noqa: PLR0913 self.kernels_dir = Path(kernels_dir).resolve() self.golden_path = Path(golden_path).resolve() self.platform = platform - self.enable_profiling = enable_profiling + self._perf_level = _normalize_perf_level(enable_profiling) self.enable_dump_tensor = enable_dump_tensor self.skip_golden = skip_golden self.project_root = PROJECT_ROOT @@ -607,9 +613,9 @@ def _compile_one_kernel(kernel): config = ChipCallConfig() config.block_dim = self.block_dim config.aicpu_thread_num = self.aicpu_thread_num - if self.enable_profiling and round_idx == 0: - config.enable_profiling = True - logger.info("Profiling enabled") + if self._perf_level > 0 and round_idx == 0: + config.enable_profiling = self._perf_level + logger.info(f"Swimlane profiling enabled (mode={self._perf_level})") if self.enable_dump_tensor: config.enable_dump_tensor = True logger.info("Dump tensor enabled") diff --git a/simpler_setup/scene_test.py b/simpler_setup/scene_test.py index 600fcab34..f4fd3db43 100644 --- a/simpler_setup/scene_test.py +++ b/simpler_setup/scene_test.py @@ -700,7 +700,7 @@ def build_callable(self, platform): return self._compile_l3_callables(platform) raise ValueError(f"Unsupported level: {self._st_level}") - def _build_config(self, config_dict, enable_profiling=False, enable_dump_tensor=False): + def _build_config(self, config_dict, enable_profiling=0, enable_dump_tensor=False): from simpler.task_interface import ChipCallConfig # noqa: PLC0415 config = ChipCallConfig() @@ -791,7 +791,7 @@ def _run_and_validate_l2( config = self._build_config( config_dict, - enable_profiling=(enable_profiling and round_idx == 0), + enable_profiling=(enable_profiling if round_idx == 0 
else 0), enable_dump_tensor=enable_dump_tensor, ) @@ -847,7 +847,7 @@ def _run_and_validate_l3( config = self._build_config( config_dict, - enable_profiling=(enable_profiling and round_idx == 0), + enable_profiling=(enable_profiling if round_idx == 0 else 0), enable_dump_tensor=enable_dump_tensor, ) @@ -948,7 +948,15 @@ def run_module(module_name): ) parser.add_argument("-n", "--rounds", type=int, default=1, help="Run each case N times (default: 1)") parser.add_argument("--skip-golden", action="store_true", help="Skip golden comparison (benchmark mode)") - parser.add_argument("--enable-profiling", action="store_true", help="Enable profiling (first round only)") + parser.add_argument( + "--enable-profiling", + type=int, + nargs="?", + const=3, + default=0, + metavar="LEVEL", + help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)", + ) parser.add_argument("--dump-tensor", action="store_true", help="Dump per-task tensor I/O at runtime") parser.add_argument("--build", action="store_true", help="Compile runtime from source") parser.add_argument( diff --git a/src/a2a3/platform/include/host/performance_collector.h b/src/a2a3/platform/include/host/performance_collector.h index cf6a52e2b..98644abbf 100644 --- a/src/a2a3/platform/include/host/performance_collector.h +++ b/src/a2a3/platform/include/host/performance_collector.h @@ -332,6 +332,12 @@ class PerformanceCollector { */ bool is_initialized() const { return perf_shared_mem_host_ != nullptr; } + /** + * Set profiling level before initialize(). 
+ * 0=off, 1=AICore-only, 2=task+fanout, 3=full (with phase buffers) + */ + void set_perf_level(int level) { perf_level_ = level; } + /** * Drain remaining buffers from the memory manager's ready queue * @@ -387,6 +393,9 @@ class PerformanceCollector { PerfRegisterCallback register_cb_{nullptr}; PerfFreeCallback free_cb_{nullptr}; + // Profiling level: 0=off, 1=AICore-only, 2=task+fanout, 3=full (with phase) + int perf_level_{0}; + // Memory manager ProfMemoryManager memory_manager_; diff --git a/src/a2a3/platform/include/host/runtime_profiling_mode.h b/src/a2a3/platform/include/host/runtime_profiling_mode.h new file mode 100644 index 000000000..a3999a300 --- /dev/null +++ b/src/a2a3/platform/include/host/runtime_profiling_mode.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * Shared helper: set perf_level and legacy enable_profiling on a Runtime struct. + * + * Used by both onboard and sim pto_runtime_c_api.cpp implementations. + * Some runtime structs still carry a bool enable_profiling member alongside + * the newer int perf_level. This template detects the legacy member at + * compile time and keeps both in sync. 
+ */ + +#pragma once + +#include + +template +struct HasEnableProfilingMember : std::false_type {}; + +template +struct HasEnableProfilingMember().enable_profiling)>> : std::true_type {}; + +template +static inline void set_runtime_profiling_mode(R *runtime, int enable_profiling) { + runtime->perf_level = enable_profiling; + if constexpr (HasEnableProfilingMember::value) { + runtime->enable_profiling = (enable_profiling > 0); + } +} diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index b3def91cc..0c9540ed9 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -461,7 +461,7 @@ int DeviceRunner::run( }); // Initialize performance profiling if enabled - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { rc = init_performance_profiling(runtime, num_aicore, device_id); if (rc != 0) { LOG_ERROR("init_performance_profiling failed: %d", rc); @@ -540,18 +540,18 @@ int DeviceRunner::run( { // Poll and collect performance data in a separate collector thread std::thread collector_thread; - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { collector_thread = create_thread([this, &runtime]() { poll_and_collect_performance_data(runtime.get_task_count()); }); } auto thread_guard = RAIIScopeGuard([&]() { - if (runtime.enable_profiling && collector_thread.joinable()) { + if (runtime.perf_level > 0 && collector_thread.joinable()) { collector_thread.join(); } }); auto collector_signal_guard = RAIIScopeGuard([this, &runtime]() { - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { perf_collector_.signal_execution_complete(); } }); @@ -588,7 +588,7 @@ int DeviceRunner::run( } // Stop memory management, drain remaining buffers, collect phase data, export - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { perf_collector_.stop_memory_manager(); perf_collector_.drain_remaining_buffers(); 
perf_collector_.scan_remaining_perf_buffers(); @@ -872,6 +872,7 @@ int DeviceRunner::init_performance_profiling(Runtime &runtime, int num_aicore, i return rtFree(dev_ptr); }; + perf_collector_.set_perf_level(runtime.perf_level); return perf_collector_.initialize(runtime, num_aicore, device_id, alloc_cb, register_cb, free_cb); } diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index d672355f6..3d2369025 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -26,6 +26,7 @@ #include "common/unified_log.h" #include "device_runner.h" #include "host/raii_scope_guard.h" +#include "host/runtime_profiling_mode.h" #include "runtime.h" extern "C" { @@ -162,9 +163,7 @@ int run_runtime( return rc; } - if (enable_profiling) { - r->enable_profiling = true; - } + set_runtime_profiling_mode(r, enable_profiling); std::vector aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size); std::vector aicore_vec(aicore_binary, aicore_binary + aicore_size); diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index 0979052b6..7a32eb2d4 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -314,7 +314,7 @@ int DeviceRunner::run( last_runtime_ = &runtime; // Initialize performance profiling if enabled - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { rc = init_performance_profiling(runtime, num_aicore, device_id); if (rc != 0) { LOG_ERROR("init_performance_profiling failed: %d", rc); @@ -420,7 +420,7 @@ int DeviceRunner::run( // Poll and collect performance data during execution (if enabled) std::thread collector_thread; - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { collector_thread = create_thread([this, &runtime]() { poll_and_collect_performance_data(runtime.get_task_count()); }); @@ -442,13 +442,13 @@ int 
DeviceRunner::run( } // Signal collector that device execution is complete - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { perf_collector_.signal_execution_complete(); } dump_collector_.signal_execution_complete(); // Wait for collector thread if it was launched - if (runtime.enable_profiling && collector_thread.joinable()) { + if (runtime.perf_level > 0 && collector_thread.joinable()) { collector_thread.join(); } if (dump_collector_thread.joinable()) { @@ -465,12 +465,12 @@ int DeviceRunner::run( } // Signal collector that device execution is complete - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { perf_collector_.signal_execution_complete(); } // Wait for collector thread if it was launched - if (runtime.enable_profiling && collector_thread.joinable()) { + if (runtime.perf_level > 0 && collector_thread.joinable()) { collector_thread.join(); } } @@ -484,7 +484,7 @@ int DeviceRunner::run( } // Stop memory management, drain remaining buffers, collect phase data, export - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { perf_collector_.stop_memory_manager(); perf_collector_.drain_remaining_buffers(); perf_collector_.scan_remaining_perf_buffers(); @@ -710,6 +710,7 @@ void DeviceRunner::remove_kernel_binary(int func_id) { // ============================================================================= int DeviceRunner::init_performance_profiling(Runtime &runtime, int num_aicore, int device_id) { + perf_collector_.set_perf_level(runtime.perf_level); // Define allocation callback (a2a3sim: use malloc) auto alloc_cb = [](size_t size) -> void * { return malloc(size); diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index 3e7dfd89e..96f927c73 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -28,6 +28,7 @@ #include "cpu_sim_context.h" #include "device_runner.h" #include "runtime.h" +#include 
"host/runtime_profiling_mode.h" extern "C" { @@ -154,9 +155,7 @@ int run_runtime( } // Phase 2: profiling - if (enable_profiling) { - r->enable_profiling = true; - } + set_runtime_profiling_mode(r, enable_profiling); // Phase 3: launch std::vector aicpu_vec; diff --git a/src/a2a3/platform/src/aicpu/performance_collector_aicpu.cpp b/src/a2a3/platform/src/aicpu/performance_collector_aicpu.cpp index 37e1f12c7..476d704a4 100644 --- a/src/a2a3/platform/src/aicpu/performance_collector_aicpu.cpp +++ b/src/a2a3/platform/src/aicpu/performance_collector_aicpu.cpp @@ -230,7 +230,7 @@ void perf_aicpu_switch_buffer(Runtime *runtime, int core_id, int thread_idx) { } void perf_aicpu_flush_buffers(Runtime *runtime, int thread_idx, const int *cur_thread_cores, int core_num) { - if (!runtime->enable_profiling) { + if (runtime->perf_level <= 0) { return; } diff --git a/src/a2a3/platform/src/host/performance_collector.cpp b/src/a2a3/platform/src/host/performance_collector.cpp index 79a3bd78b..7ee695437 100644 --- a/src/a2a3/platform/src/host/performance_collector.cpp +++ b/src/a2a3/platform/src/host/performance_collector.cpp @@ -77,7 +77,7 @@ void ProfMemoryManager::stop() { // Drain remaining done_queue and free buffers { - std::lock_guard lock(done_mutex_); + std::scoped_lock lock(done_mutex_); while (!done_queue_.empty()) { CopyDoneInfo info = done_queue_.front(); done_queue_.pop(); @@ -99,7 +99,7 @@ void ProfMemoryManager::stop() { } bool ProfMemoryManager::try_pop_ready(ReadyBufferInfo &info) { - std::lock_guard lock(ready_mutex_); + std::scoped_lock lock(ready_mutex_); if (ready_queue_.empty()) { return false; } @@ -121,7 +121,7 @@ bool ProfMemoryManager::wait_pop_ready(ReadyBufferInfo &info, std::chrono::milli } void ProfMemoryManager::notify_copy_done(const CopyDoneInfo &info) { - std::lock_guard lock(done_mutex_); + std::scoped_lock lock(done_mutex_); done_queue_.push(info); } @@ -210,7 +210,7 @@ void ProfMemoryManager::process_ready_entry( host_ptr = 
resolve_host_ptr(new_dev_ptr); } if (new_dev_ptr == nullptr) { - std::lock_guard lock(done_mutex_); + std::scoped_lock lock(done_mutex_); while (!done_queue_.empty()) { CopyDoneInfo dinfo = done_queue_.front(); done_queue_.pop(); @@ -258,7 +258,7 @@ void ProfMemoryManager::process_ready_entry( info.buffer_seq = seq; { - std::lock_guard lock(ready_mutex_); + std::scoped_lock lock(ready_mutex_); ready_queue_.push(info); } ready_cv_.notify_one(); @@ -289,7 +289,7 @@ void ProfMemoryManager::process_ready_entry( host_ptr = resolve_host_ptr(new_dev_ptr); } if (new_dev_ptr == nullptr) { - std::lock_guard lock(done_mutex_); + std::scoped_lock lock(done_mutex_); while (!done_queue_.empty()) { CopyDoneInfo dinfo = done_queue_.front(); done_queue_.pop(); @@ -335,7 +335,7 @@ void ProfMemoryManager::process_ready_entry( info.buffer_seq = seq; { - std::lock_guard lock(ready_mutex_); + std::scoped_lock lock(ready_mutex_); ready_queue_.push(info); } ready_cv_.notify_one(); @@ -348,7 +348,7 @@ void ProfMemoryManager::mgmt_loop() { while (running_.load()) { // 1. Recycle done queue: move completed buffers to recycled pools for reuse { - std::lock_guard lock(done_mutex_); + std::scoped_lock lock(done_mutex_); while (!done_queue_.empty()) { CopyDoneInfo info = done_queue_.front(); done_queue_.pop(); @@ -561,8 +561,9 @@ int PerformanceCollector::initialize( free_cb_ = free_cb; // Step 1: Calculate shared memory size (slot arrays only, no actual buffers) - int num_phase_threads = PLATFORM_MAX_AICPU_THREADS; - size_t total_size = calc_perf_data_size_with_phases(num_aicore, num_phase_threads); + int num_phase_threads = (perf_level_ >= 3) ? PLATFORM_MAX_AICPU_THREADS : 0; + size_t total_size = (num_phase_threads > 0) ? 
calc_perf_data_size_with_phases(num_aicore, num_phase_threads) : + calc_perf_data_size(num_aicore); LOG_DEBUG("Shared memory allocation plan:"); LOG_DEBUG(" Number of cores: %d", num_aicore); @@ -651,41 +652,43 @@ int PerformanceCollector::initialize( num_aicore * (PLATFORM_PROF_BUFFERS_PER_CORE - 1) ); - // Step 6: Initialize PhaseBufferStates — 1 buffer per thread in free_queue, rest to recycled pool - for (int t = 0; t < num_phase_threads; t++) { - PhaseBufferState *state = get_phase_buffer_state(perf_host_ptr, num_aicore, t); - memset(state, 0, sizeof(PhaseBufferState)); - - state->free_queue.head = 0; - state->free_queue.tail = 0; - state->current_buf_ptr = 0; - state->current_buf_seq = 0; - - for (int s = 0; s < PLATFORM_PROF_BUFFERS_PER_THREAD; s++) { - void *host_buf_ptr = nullptr; - void *dev_buf_ptr = alloc_single_buffer(sizeof(PhaseBuffer), &host_buf_ptr); - if (dev_buf_ptr == nullptr) { - LOG_ERROR("Failed to allocate PhaseBuffer for thread %d, buffer %d", t, s); - return -1; - } - PhaseBuffer *buf = reinterpret_cast(host_buf_ptr); - memset(buf, 0, sizeof(PhaseBuffer)); - buf->count = 0; + // Step 6: Initialize PhaseBufferStates (only when phase recording enabled) + if (num_phase_threads > 0) { + for (int t = 0; t < num_phase_threads; t++) { + PhaseBufferState *state = get_phase_buffer_state(perf_host_ptr, num_aicore, t); + memset(state, 0, sizeof(PhaseBufferState)); + + state->free_queue.head = 0; + state->free_queue.tail = 0; + state->current_buf_ptr = 0; + state->current_buf_seq = 0; + + for (int s = 0; s < PLATFORM_PROF_BUFFERS_PER_THREAD; s++) { + void *host_buf_ptr = nullptr; + void *dev_buf_ptr = alloc_single_buffer(sizeof(PhaseBuffer), &host_buf_ptr); + if (dev_buf_ptr == nullptr) { + LOG_ERROR("Failed to allocate PhaseBuffer for thread %d, buffer %d", t, s); + return -1; + } + PhaseBuffer *buf = reinterpret_cast(host_buf_ptr); + memset(buf, 0, sizeof(PhaseBuffer)); + buf->count = 0; - if (s == 0) { - state->free_queue.buffer_ptrs[0] = 
reinterpret_cast(dev_buf_ptr); - } else { - memory_manager_.recycled_phase_buffers_.push_back(dev_buf_ptr); + if (s == 0) { + state->free_queue.buffer_ptrs[0] = reinterpret_cast(dev_buf_ptr); + } else { + memory_manager_.recycled_phase_buffers_.push_back(dev_buf_ptr); + } } + wmb(); + state->free_queue.tail = 1; + wmb(); } - wmb(); - state->free_queue.tail = 1; - wmb(); + LOG_DEBUG( + "Initialized %d PhaseBufferStates: 1 buffer/thread, %d in recycled pool", num_phase_threads, + num_phase_threads * (PLATFORM_PROF_BUFFERS_PER_THREAD - 1) + ); } - LOG_DEBUG( - "Initialized %d PhaseBufferStates: 1 buffer/thread, %d in recycled pool", num_phase_threads, - num_phase_threads * (PLATFORM_PROF_BUFFERS_PER_THREAD - 1) - ); wmb(); @@ -706,8 +709,8 @@ void PerformanceCollector::start_memory_manager(const ThreadFactory &thread_fact } memory_manager_.start( - perf_shared_mem_host_, num_aicore_, PLATFORM_MAX_AICPU_THREADS, alloc_cb_, register_cb_, free_cb_, device_id_, - thread_factory + perf_shared_mem_host_, num_aicore_, (perf_level_ >= 3) ? PLATFORM_MAX_AICPU_THREADS : 0, alloc_cb_, + register_cb_, free_cb_, device_id_, thread_factory ); } @@ -1245,7 +1248,17 @@ int PerformanceCollector::export_swimlane_json(const std::string &output_path) { } // Step 7: Write JSON data - int version = has_phase_data_ ? 2 : 1; + int version; + if (perf_level_ <= 1) { + version = 0; + } else if (has_phase_data_) { + version = 2; + } else { + if (perf_level_ >= 3) { + LOG_WARN("perf_level=%d but no phase data collected; writing version=1", perf_level_); + } + version = 1; + } outfile << "{\n"; outfile << " \"version\": " << version << ",\n"; outfile << " \"tasks\": [\n"; @@ -1258,8 +1271,6 @@ int PerformanceCollector::export_swimlane_json(const std::string &output_path) { double start_us = cycles_to_us(record.start_time - base_time_cycles); double end_us = cycles_to_us(record.end_time - base_time_cycles); double duration_us = end_us - start_us; - double dispatch_us = (record.dispatch_time > 0) ? 
cycles_to_us(record.dispatch_time - base_time_cycles) : 0.0; - double finish_us = (record.finish_time > 0) ? cycles_to_us(record.finish_time - base_time_cycles) : 0.0; const char *core_type_str = (record.core_type == CoreType::AIC) ? "aic" : "aiv"; @@ -1271,20 +1282,27 @@ int PerformanceCollector::export_swimlane_json(const std::string &output_path) { outfile << " \"ring_id\": " << static_cast(record.task_id >> 32) << ",\n"; outfile << " \"start_time_us\": " << std::fixed << std::setprecision(3) << start_us << ",\n"; outfile << " \"end_time_us\": " << std::fixed << std::setprecision(3) << end_us << ",\n"; - outfile << " \"duration_us\": " << std::fixed << std::setprecision(3) << duration_us << ",\n"; - outfile << " \"dispatch_time_us\": " << std::fixed << std::setprecision(3) << dispatch_us << ",\n"; - outfile << " \"finish_time_us\": " << std::fixed << std::setprecision(3) << finish_us << ",\n"; - outfile << " \"fanout\": ["; - int safe_fanout_count = - (record.fanout_count >= 0 && record.fanout_count <= RUNTIME_MAX_FANOUT) ? record.fanout_count : 0; - for (int j = 0; j < safe_fanout_count; ++j) { - outfile << record.fanout[j]; - if (j < safe_fanout_count - 1) { - outfile << ", "; + if (perf_level_ >= 2) { + double dispatch_us = + (record.dispatch_time > 0) ? cycles_to_us(record.dispatch_time - base_time_cycles) : 0.0; + double finish_us = (record.finish_time > 0) ? cycles_to_us(record.finish_time - base_time_cycles) : 0.0; + outfile << " \"duration_us\": " << std::fixed << std::setprecision(3) << duration_us << ",\n"; + outfile << " \"dispatch_time_us\": " << std::fixed << std::setprecision(3) << dispatch_us << ",\n"; + outfile << " \"finish_time_us\": " << std::fixed << std::setprecision(3) << finish_us << ",\n"; + outfile << " \"fanout\": ["; + int safe_fanout_count = + (record.fanout_count >= 0 && record.fanout_count <= RUNTIME_MAX_FANOUT) ? 
record.fanout_count : 0; + for (int j = 0; j < safe_fanout_count; ++j) { + outfile << record.fanout[j]; + if (j < safe_fanout_count - 1) { + outfile << ", "; + } } + outfile << "],\n"; + outfile << " \"fanout_count\": " << record.fanout_count << "\n"; + } else { + outfile << " \"duration_us\": " << std::fixed << std::setprecision(3) << duration_us << "\n"; } - outfile << "],\n"; - outfile << " \"fanout_count\": " << record.fanout_count << "\n"; outfile << " }"; if (i < tagged_records.size() - 1) { outfile << ","; diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h b/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h index ca21bd33c..e6623dd8a 100644 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h @@ -168,6 +168,7 @@ class Runtime { // Profiling support bool enable_profiling; // Enable profiling flag + int perf_level = 0; // Derived from enable_profiling: 0=off, 2=task+fanout // Orchestrator-to-scheduler transition control // When true, orchestrator threads convert to scheduler threads after orchestration completes. 
diff --git a/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp index 3b4f541ba..c9ac022ee 100644 --- a/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp @@ -646,7 +646,7 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const int verification_warning_count = 0; const int MAX_VERIFICATION_WARNINGS = 10; - bool profiling_enabled = runtime.enable_profiling; + bool profiling_enabled = runtime.perf_level > 0; // Extract array pointers as local variables for better readability and performance int *cur_ready_queue_aic = cur_ready_queue_aic_[thread_idx]; diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp b/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp index 25af6e4c7..85cdf5927 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp @@ -46,6 +46,7 @@ Runtime::Runtime() { worker_count = 0; sche_cpu_num = 1; enable_profiling = false; + perf_level = 0; perf_data_base = 0; tensor_pair_count = 0; tensor_info_storage_ = nullptr; diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h index f528acf81..30d6cd244 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h @@ -203,7 +203,8 @@ class Runtime { int sche_cpu_num; // Number of AICPU threads for scheduling // Profiling support - bool enable_profiling; // Enable profiling flag + bool enable_profiling; // Legacy flag for host_build_graph runtime paths + int perf_level; // 0=off, >0=profiling on (extended mode encoding) uint64_t perf_data_base; // Performance data shared memory base address (device-side) // Task storage diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp 
b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp index 4113cba8b..4840a145e 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp @@ -88,7 +88,7 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in // Cache per-core dispatch payload pointer (set by AICPU before aicpu_ready) __gm__ PTO2DispatchPayload *payload = reinterpret_cast<__gm__ PTO2DispatchPayload *>(my_hank->task); - bool profiling_enabled = runtime->enable_profiling; + bool profiling_enabled = (runtime->perf_level > 0); // Phase 4: Main execution loop - poll register for tasks until exit signal // Register encoding: AICPU_IDLE_TASK_ID=idle, task_id=task, AICORE_EXIT_SIGNAL=exit diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index cc4fe8ab8..c2a39182b 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -479,7 +479,7 @@ struct AicpuExecutor { PTO2LocalReadyBuffer *local_bufs, CoreType ct #if PTO2_PROFILING , - bool profiling_enabled, uint32_t &phase_complete_count, uint64_t dispatch_ts + bool task_recording_enabled, int perf_level, uint32_t &phase_complete_count, uint64_t dispatch_ts #endif #if PTO2_SCHED_PROFILING , @@ -542,26 +542,31 @@ struct AicpuExecutor { } #if PTO2_PROFILING - if (profiling_enabled) { + if (task_recording_enabled) { #if PTO2_SCHED_PROFILING uint64_t t_perf_start = get_sys_cnt_aicpu(); #endif Handshake *h = &hank[core_id]; - uint64_t finish_ts = get_sys_cnt_aicpu(); PerfBuffer *perf_buf = reinterpret_cast(h->perf_records_addr); uint64_t fanout_arr[RUNTIME_MAX_FANOUT]; int32_t fanout_n = 0; - PTO2DepListEntry *cur = slot_state.fanout_head; - while (cur != nullptr && fanout_n < RUNTIME_MAX_FANOUT) { - fanout_arr[fanout_n++] = 
cur->slot_state->task->task_id.raw; - cur = cur->next; + uint64_t finish_ts = 0; + + if (perf_level >= 2) { + finish_ts = get_sys_cnt_aicpu(); + PTO2DepListEntry *cur = slot_state.fanout_head; + while (cur != nullptr && fanout_n < RUNTIME_MAX_FANOUT) { + fanout_arr[fanout_n++] = cur->slot_state->task->task_id.raw; + cur = cur->next; + } } int32_t perf_slot_idx = static_cast(subslot); if (perf_aicpu_complete_record( perf_buf, static_cast(expected_reg_task_id), slot_state.task->task_id.raw, - slot_state.task->kernel_id[perf_slot_idx], ct, dispatch_ts, finish_ts, fanout_arr, fanout_n + slot_state.task->kernel_id[perf_slot_idx], ct, dispatch_ts, finish_ts, + (perf_level >= 2) ? fanout_arr : nullptr, fanout_n ) != 0) { DEV_ERROR( "Core %d: perf_aicpu_complete_record failed for task 0x%" PRIx64, core_id, @@ -600,7 +605,7 @@ struct AicpuExecutor { PTO2LocalReadyBuffer *local_bufs #if PTO2_PROFILING , - bool profiling_enabled, uint32_t &phase_complete_count + bool task_recording_enabled, int perf_level, uint32_t &phase_complete_count #endif #if PTO2_SCHED_PROFILING , @@ -622,7 +627,7 @@ struct AicpuExecutor { int32_t reg_state = EXTRACT_TASK_STATE(reg_val); #if PTO2_SCHED_PROFILING - if (profiling_enabled) { + if (task_recording_enabled) { complete_probe_count++; } #endif @@ -632,7 +637,7 @@ struct AicpuExecutor { if (!t.matched) continue; #if PTO2_SCHED_PROFILING - if (profiling_enabled && (t.running_done || t.pending_done)) { + if (task_recording_enabled && (t.running_done || t.pending_done)) { complete_hit_count++; } #endif @@ -646,7 +651,7 @@ struct AicpuExecutor { completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs, CT #if PTO2_PROFILING , - profiling_enabled, phase_complete_count, core.pending_dispatch_timestamp + task_recording_enabled, perf_level, phase_complete_count, core.pending_dispatch_timestamp #endif #if PTO2_SCHED_PROFILING , @@ -662,7 +667,7 @@ struct AicpuExecutor { completed_this_turn, deferred_release_slot_states, 
deferred_release_count, local_bufs, CT #if PTO2_PROFILING , - profiling_enabled, phase_complete_count, core.running_dispatch_timestamp + task_recording_enabled, perf_level, phase_complete_count, core.running_dispatch_timestamp #endif #if PTO2_SCHED_PROFILING , @@ -809,7 +814,7 @@ struct AicpuExecutor { PTO2SubtaskSlot subslot, bool to_pending #if PTO2_PROFILING , - bool profiling_enabled + bool task_recording_enabled #endif ) { CoreTracker &tracker = core_trackers_[thread_idx]; @@ -849,7 +854,7 @@ struct AicpuExecutor { core_exec_state.pending_slot_state = &slot_state; core_exec_state.pending_reg_task_id = static_cast(reg_task_id); #if PTO2_PROFILING - if (profiling_enabled) { + if (task_recording_enabled && runtime->perf_level >= 2) { core_exec_state.pending_dispatch_timestamp = get_sys_cnt_aicpu(); } #endif @@ -858,7 +863,7 @@ struct AicpuExecutor { core_exec_state.running_slot_state = &slot_state; core_exec_state.running_reg_task_id = static_cast(reg_task_id); #if PTO2_PROFILING - if (profiling_enabled) { + if (task_recording_enabled && runtime->perf_level >= 2) { core_exec_state.running_dispatch_timestamp = get_sys_cnt_aicpu(); } #endif @@ -866,7 +871,7 @@ struct AicpuExecutor { tracker.change_core_state(core_offset); } #if PTO2_PROFILING - if (profiling_enabled) { + if (task_recording_enabled) { if (core_exec_state.dispatch_count >= PLATFORM_PROF_BUFFER_SIZE) { perf_aicpu_switch_buffer(runtime, core_id, thread_idx); core_exec_state.dispatch_count = 0; @@ -888,7 +893,7 @@ struct AicpuExecutor { Runtime *runtime, int32_t thread_idx, int32_t cluster_offset, PTO2TaskSlotState &slot_state #if PTO2_PROFILING , - bool profiling_enabled + bool task_recording_enabled #endif ) { CoreTracker &tracker = core_trackers_[thread_idx]; @@ -899,7 +904,7 @@ struct AicpuExecutor { false #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); } @@ -909,7 +914,7 @@ struct AicpuExecutor { false #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled 
#endif ); } @@ -919,7 +924,7 @@ struct AicpuExecutor { false #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); } @@ -959,7 +964,7 @@ struct AicpuExecutor { PTO2ResourceShape shape #if PTO2_PROFILING , - bool profiling_enabled, uint32_t &phase_dispatch_count + bool task_recording_enabled, bool phase_recording_enabled, uint32_t &phase_dispatch_count #endif ) { CoreTracker &tracker = core_trackers_[thread_idx]; @@ -981,7 +986,7 @@ struct AicpuExecutor { runtime, thread_idx, cluster_offset, slot_state #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); } else if (shape == PTO2ResourceShape::AIC) { @@ -990,7 +995,7 @@ struct AicpuExecutor { false #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); } else { // AIV @@ -1001,12 +1006,14 @@ struct AicpuExecutor { runtime, thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIV0, false #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); } #if PTO2_PROFILING - phase_dispatch_count += __builtin_popcount(pto2_core_mask(slot_state.active_mask)); + if (phase_recording_enabled) { + phase_dispatch_count += __builtin_popcount(pto2_core_mask(slot_state.active_mask)); + } #endif } @@ -1030,7 +1037,7 @@ struct AicpuExecutor { Runtime *runtime, int32_t block_num #if PTO2_PROFILING , - bool profiling_enabled, uint32_t &phase_dispatch_count + bool task_recording_enabled, bool phase_recording_enabled, uint32_t &phase_dispatch_count #endif ) { PTO2TaskSlotState *slot_state = drain_state_.pending_task; @@ -1047,7 +1054,7 @@ struct AicpuExecutor { runtime, t, valid.pop_first(), *slot_state, shape #if PTO2_PROFILING , - profiling_enabled, phase_dispatch_count + task_recording_enabled, phase_recording_enabled, phase_dispatch_count #endif ); slot_state->next_block_idx++; @@ -1082,7 +1089,7 @@ struct AicpuExecutor { Runtime *runtime, int32_t thread_idx #if PTO2_PROFILING , - bool profiling_enabled, uint32_t &phase_dispatch_count + bool 
task_recording_enabled, bool phase_recording_enabled, uint32_t &phase_dispatch_count #endif ) { // Spin until drain is fully initialized (sentinel -1 → block_num > 0). @@ -1139,7 +1146,7 @@ struct AicpuExecutor { runtime, block_num #if PTO2_PROFILING , - profiling_enabled, phase_dispatch_count + task_recording_enabled, phase_recording_enabled, phase_dispatch_count #endif ); } @@ -1532,9 +1539,10 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa #if PTO2_PROFILING // Assign perf buffers to cores early so profiling captures all tasks // (total_tasks written to header later when orchestrator completes) - if (runtime->enable_profiling) { + if (runtime->perf_level > 0) { perf_aicpu_init_profiling(runtime); - // Initialize phase profiling for scheduler threads + orchestrator threads + } + if (runtime->perf_level >= 3) { perf_aicpu_init_phase_profiling(runtime, sched_thread_num_); perf_aicpu_set_orch_thread_idx(sched_thread_num_); } @@ -1558,7 +1566,9 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa int32_t idle_iterations = 0; int32_t last_progress_count = 0; #if PTO2_PROFILING - bool profiling_enabled = runtime->enable_profiling; + int perf_level = runtime->perf_level; + bool task_recording_enabled = (perf_level > 0); + bool phase_recording_enabled = (perf_level >= 3); #endif // Scheduler profiling counters @@ -1681,7 +1691,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa deferred_release_slot_states, deferred_release_count, local_bufs #if PTO2_PROFILING , - profiling_enabled, phase_complete_count + task_recording_enabled, perf_level, phase_complete_count #endif #if PTO2_SCHED_PROFILING , @@ -1699,7 +1709,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa deferred_release_slot_states, deferred_release_count, local_bufs #if PTO2_PROFILING , - profiling_enabled, phase_complete_count + task_recording_enabled, perf_level, 
phase_complete_count #endif #if PTO2_SCHED_PROFILING , @@ -1731,7 +1741,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa CYCLE_COUNT_LAP(sched_idle_cycle); } else { CYCLE_COUNT_LAP(sched_complete_cycle); - if (profiling_enabled && phase_complete_count > 0) { + if (phase_recording_enabled && phase_complete_count > 0) { perf_aicpu_record_phase( thread_idx, AicpuPhaseId::SCHED_COMPLETE, _t0_phase, _t1, sched_loop_count, phase_complete_count ); @@ -1750,7 +1760,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa runtime, thread_idx #if PTO2_PROFILING , - profiling_enabled, phase_dispatch_count + task_recording_enabled, phase_recording_enabled, phase_dispatch_count #endif ); continue; @@ -1840,7 +1850,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa runtime, thread_idx, current_valid_cluster_offset, *slot_state, shape #if PTO2_PROFILING , - profiling_enabled, phase_dispatch_count + task_recording_enabled, phase_recording_enabled, phase_dispatch_count #endif ); slot_state->next_block_idx++; @@ -1910,7 +1920,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa PTO2SubtaskSlot::AIC, true #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); slot_state->next_block_idx++; @@ -1948,7 +1958,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa CYCLE_COUNT_LAP(sched_idle_cycle); } else { CYCLE_COUNT_LAP(sched_dispatch_cycle); - if (profiling_enabled && phase_dispatch_count > 0) { + if (phase_recording_enabled && phase_dispatch_count > 0) { perf_aicpu_record_phase( thread_idx, AicpuPhaseId::SCHED_DISPATCH, _t0_phase, _t1, sched_loop_count, phase_dispatch_count ); @@ -2108,7 +2118,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa } #if PTO2_PROFILING CYCLE_COUNT_LAP(sched_idle_cycle); - if (profiling_enabled) { + if (phase_recording_enabled) { 
perf_aicpu_record_phase(thread_idx, AicpuPhaseId::SCHED_IDLE_WAIT, _t0_phase, _t1, sched_loop_count, 0); _t0_phase = _t1; } @@ -2275,9 +2285,10 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa #endif #if PTO2_PROFILING - // Flush performance buffers for cores managed by this thread - if (profiling_enabled) { + if (task_recording_enabled) { perf_aicpu_flush_buffers(runtime, thread_idx, core_assignments_[thread_idx], core_num); + } + if (phase_recording_enabled) { perf_aicpu_flush_phase_buffers(thread_idx); } #endif @@ -2479,7 +2490,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { } #if PTO2_PROFILING - rt->orchestrator.enable_profiling = runtime->enable_profiling; + rt->orchestrator.perf_level = runtime->perf_level; #endif // Total core counts = aic_count_ / aiv_count_ (set once at runtime init). @@ -2503,7 +2514,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { } #if PTO2_PROFILING - if (runtime->enable_profiling) { + if (runtime->perf_level >= 3) { perf_aicpu_set_orch_thread_idx(thread_idx); } #endif @@ -2588,7 +2599,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { #if PTO2_PROFILING // Write orchestrator summary to shared memory for host-side export (only if profiling enabled) - if (runtime->enable_profiling) { + if (runtime->perf_level >= 3) { AicpuOrchSummary orch_summary = {}; orch_summary.start_time = orch_cycle_start; orch_summary.end_time = orch_cycle_end; @@ -2608,7 +2619,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { #if PTO2_PROFILING // Write core-to-thread mapping (one-time, after orchestration) - if (runtime->enable_profiling) { + if (runtime->perf_level >= 3) { perf_aicpu_write_core_assignments( core_assignments_, core_count_per_thread_, sched_thread_num_, cores_total_num_ ); @@ -2632,7 +2643,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { pto2_submitted_tasks = pto2_task_count; #endif total_tasks_ = pto2_task_count; - if (runtime->enable_profiling && pto2_task_count > 0) { + if (runtime->perf_level 
> 0 && pto2_task_count > 0) { perf_aicpu_update_total_tasks(runtime, static_cast(pto2_task_count)); } int32_t inline_completed = static_cast<int32_t>(rt->orchestrator.inline_completed_tasks); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index 85420e245..558b575f9 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -95,8 +95,8 @@ __attribute__((weak, visibility("hidden"))) void perf_aicpu_record_orch_phase(AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {} // submit_idx needed for swimlane task_id tagging (no cycle accumulation at this level) static uint32_t g_orch_submit_idx = 0; -#define CYCLE_COUNT_START() \ - bool _prof_active = orch->enable_profiling; \ +#define CYCLE_COUNT_START() \ + bool _prof_active = (orch->perf_level >= 3); \ uint64_t _t0 = _prof_active ? get_sys_cnt_aicpu() : 0, _t1 = 0 #define CYCLE_COUNT_LAP(acc) \ do { \ diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index 0ad5e6873..3089a0442 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -75,8 +75,7 @@ struct PTO2OrchestratorState { int32_t total_cluster_count{0}; // AIC cores = MIX clusters int32_t total_aiv_count{0}; // AIV cores (= 2 × clusters on standard hardware) #if PTO2_PROFILING - // Runtime profiling switch copied from Runtime::enable_profiling.
- bool enable_profiling; + int perf_level; #endif // === GM HEAP (for output buffers) === diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 2d0e0b4b4..2b0fff2e2 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -171,7 +171,7 @@ class Runtime { uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID]; // Profiling support - bool enable_profiling; // Enable profiling flag + int perf_level = 0; // 0=off, 1=AICore-only, 2=task+fanout, 3=full // Orchestrator-to-scheduler transition control // When true, orchestrator threads convert to scheduler threads after orchestration completes. diff --git a/src/common/distributed/dist_chip_process.cpp b/src/common/distributed/dist_chip_process.cpp index 3bbe6e003..e79a4c9cb 100644 --- a/src/common/distributed/dist_chip_process.cpp +++ b/src/common/distributed/dist_chip_process.cpp @@ -56,7 +56,7 @@ void DistChipProcess::run(uint64_t callable, TaskArgsView args, const ChipCallCo // Write config fields. int32_t block_dim = config.block_dim; int32_t aicpu_tn = config.aicpu_thread_num; - int32_t profiling = config.enable_profiling ? 1 : 0; + int32_t profiling = config.perf_level; std::memcpy(base() + OFF_BLOCK_DIM, &block_dim, sizeof(int32_t)); std::memcpy(base() + OFF_AICPU_THREAD_NUM, &aicpu_tn, sizeof(int32_t)); std::memcpy(base() + OFF_ENABLE_PROFILING, &profiling, sizeof(int32_t)); diff --git a/src/common/task_interface/chip_call_config.h b/src/common/task_interface/chip_call_config.h index 1f1eb7721..1b8e30b81 100644 --- a/src/common/task_interface/chip_call_config.h +++ b/src/common/task_interface/chip_call_config.h @@ -11,7 +11,7 @@ /** * ChipCallConfig — per-NEXT_LEVEL-task config (block_dim, aicpu_thread_num, - * enable_profiling). Lives here (rather than chip_worker.h) so distributed + * enable_profiling/perf_level). 
Lives here (rather than chip_worker.h) so distributed * task slot state can store it directly without pulling in the full * ChipWorker header (which depends on dist_types.h). */ @@ -21,6 +21,6 @@ struct ChipCallConfig { int block_dim = 24; int aicpu_thread_num = 3; - bool enable_profiling = false; + int perf_level = 0; // 0=off, 1=AICore-only, 2=task+fanout, 3=full bool enable_dump_tensor = false; }; diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp index e919fcc27..3a3a7fa32 100644 --- a/src/common/worker/chip_worker.cpp +++ b/src/common/worker/chip_worker.cpp @@ -205,7 +205,7 @@ void ChipWorker::run(const void *callable, const void *args, const ChipCallConfi int rc = run_runtime_fn_( device_ctx_, rt, callable, args, config.block_dim, config.aicpu_thread_num, device_id_, aicpu_binary_.data(), - aicpu_binary_.size(), aicore_binary_.data(), aicore_binary_.size(), config.enable_profiling ? 1 : 0, + aicpu_binary_.size(), aicore_binary_.data(), aicore_binary_.size(), config.perf_level, config.enable_dump_tensor ? 
1 : 0 ); if (rc != 0) { diff --git a/tests/ut/py/test_chip_worker.py b/tests/ut/py/test_chip_worker.py index 59b85f985..8a3947e3b 100644 --- a/tests/ut/py/test_chip_worker.py +++ b/tests/ut/py/test_chip_worker.py @@ -31,7 +31,7 @@ def test_defaults(self): config = ChipCallConfig() assert config.block_dim == 24 assert config.aicpu_thread_num == 3 - assert config.enable_profiling is False + assert config.enable_profiling == 0 def test_setters(self): config = ChipCallConfig() @@ -40,13 +40,13 @@ def test_setters(self): config.enable_profiling = True assert config.block_dim == 32 assert config.aicpu_thread_num == 4 - assert config.enable_profiling is True + assert config.enable_profiling > 0 def test_repr(self): config = ChipCallConfig() r = repr(config) assert "block_dim=24" in r - assert "enable_profiling=False" in r + assert "enable_profiling=0" in r # ============================================================================ diff --git a/tools/swimlane_converter.py b/tools/swimlane_converter.py index 9acd7ab9a..1aaa9f69f 100644 --- a/tools/swimlane_converter.py +++ b/tools/swimlane_converter.py @@ -120,8 +120,8 @@ def read_perf_data(filepath): raise ValueError(f"Missing required field: {field}") # Validate version - if data["version"] not in [1, 2]: - raise ValueError(f"Unsupported version: {data['version']} (expected 1 or 2)") + if data["version"] not in [0, 1, 2]: + raise ValueError(f"Unsupported version: {data['version']} (expected 0/1/2)") return data @@ -482,7 +482,8 @@ def generate_chrome_trace_json( # noqa: PLR0912, PLR0915 dur = task["duration_us"] # Build fanout hint string (packed ids → rXtY / tY for readability) - fanout_str = "[" + ", ".join(format_task_display(x) for x in task["fanout"]) + "]" + fanout_list = task.get("fanout", []) + fanout_str = "[" + ", ".join(format_task_display(x) for x in fanout_list) + "]" # Get function name if available func_id = task["func_id"] @@ -632,7 +633,7 @@ def generate_chrome_trace_json( # noqa: PLR0912, PLR0915 
src_tid = core_to_tid[task["core_id"]] src_ts_end = task["end_time_us"] - for succ_task_id in task["fanout"]: + for succ_task_id in task.get("fanout", []): if succ_task_id not in task_map: if verbose: print( @@ -864,7 +865,7 @@ def generate_chrome_trace_json( # noqa: PLR0912, PLR0915 src_tid = task_to_aicpu_tid.get(task["task_id"], core_to_tid[task["core_id"]]) src_aicpu_eid = task_to_aicpu_event_id.get(task["task_id"]) - for succ_task_id in task["fanout"]: + for succ_task_id in task.get("fanout", []): if succ_task_id not in task_map: continue