hw-native-sys
diff --git a/‎conftest.py‎
Lines changed: 6 additions & 1 deletion b/‎conftest.py‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎examples/scripts/run_example.py‎
Lines changed: 6 additions & 2 deletions b/‎examples/scripts/run_example.py‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎python/bindings/task_interface.cpp‎
Lines changed: 17 additions & 5 deletions b/‎python/bindings/task_interface.cpp‎
Lines changed: 17 additions & 5 deletions
diff --git a/‎python/simpler/worker.py‎
Lines changed: 1 addition & 1 deletion b/‎python/simpler/worker.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎simpler_setup/code_runner.py‎
Lines changed: 11 additions & 5 deletions b/‎simpler_setup/code_runner.py‎
Lines changed: 11 additions & 5 deletions
diff --git a/‎simpler_setup/scene_test.py‎
Lines changed: 12 additions & 4 deletions b/‎simpler_setup/scene_test.py‎
Lines changed: 12 additions & 4 deletions
diff --git a/‎src/a2a3/platform/include/host/performance_collector.h‎
Lines changed: 9 additions & 0 deletions b/‎src/a2a3/platform/include/host/performance_collector.h‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎src/a2a3/platform/include/host/runtime_profiling_mode.h‎
Lines changed: 37 additions & 0 deletions b/‎src/a2a3/platform/include/host/runtime_profiling_mode.h‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎src/a2a3/platform/onboard/host/device_runner.cpp‎
Lines changed: 6 additions & 5 deletions b/‎src/a2a3/platform/onboard/host/device_runner.cpp‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp‎
Lines changed: 2 additions & 3 deletions b/‎src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp‎
Lines changed: 2 additions & 3 deletions
@@ -67,7 +67,12 @@ def pytest_addoption(parser):
         "--skip-golden", action="store_true", default=False, help="Skip golden comparison (benchmark mode)"
     )
     parser.addoption(
-        "--enable-profiling", action="store_true", default=False, help="Enable profiling (first round only)"
+        "--enable-profiling",
+        type=int,
+        nargs="?",
+        const=3,
+        default=0,
+        help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)",
     )
     parser.addoption("--build", action="store_true", default=False, help="Compile runtime from source")
 
 
@@ -139,8 +139,12 @@ def compute_golden(tensors: dict, params: dict) -> None:
 
     parser.add_argument(
         "--enable-profiling",
-        action="store_true",
-        help="Enable profiling and generate swimlane.json",
+        type=int,
+        nargs="?",
+        const=3,
+        default=0,
+        metavar="LEVEL",
+        help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)",
     )
 
     parser.add_argument(
 
@@ -540,11 +540,23 @@ NB_MODULE(_task_interface, m) {
         .def(nb::init<>())
         .def_rw("block_dim", &ChipCallConfig::block_dim)
         .def_rw("aicpu_thread_num", &ChipCallConfig::aicpu_thread_num)
-        .def_rw("enable_profiling", &ChipCallConfig::enable_profiling)
+        .def_prop_rw(
+            "enable_profiling",
+            [](const ChipCallConfig &self) {
+                return self.perf_level;
+            },
+            [](ChipCallConfig &self, nb::object v) {
+                if (nb::isinstance<nb::bool_>(v)) {
+                    self.perf_level = nb::cast<bool>(v) ? 3 : 0;
+                } else {
+                    self.perf_level = nb::cast<int>(v);
+                }
+            }
+        )
         .def("__repr__", [](const ChipCallConfig &self) -> std::string {
             std::ostringstream os;
             os << "ChipCallConfig(block_dim=" << self.block_dim << ", aicpu_thread_num=" << self.aicpu_thread_num
-               << ", enable_profiling=" << (self.enable_profiling ? "True" : "False") << ")";
+               << ", enable_profiling=" << self.perf_level << ")";
             return os.str();
         });
 
@@ -569,15 +581,15 @@ NB_MODULE(_task_interface, m) {
         .def(
             "run_raw",
             [](ChipWorker &self, uint64_t callable, uint64_t args, int block_dim, int aicpu_thread_num,
-               bool enable_profiling) {
+               int perf_level) {
                 ChipCallConfig config;
                 config.block_dim = block_dim;
                 config.aicpu_thread_num = aicpu_thread_num;
-                config.enable_profiling = enable_profiling;
+                config.perf_level = perf_level;
                 self.run(reinterpret_cast<const void *>(callable), reinterpret_cast<const void *>(args), config);
             },
             nb::arg("callable"), nb::arg("args"), nb::arg("block_dim") = 1, nb::arg("aicpu_thread_num") = 3,
-            nb::arg("enable_profiling") = false, "Run with raw pointer arguments (used from forked chip process)."
+            nb::arg("perf_level") = 0, "Run with raw pointer arguments (used from forked chip process)."
         )
         .def_prop_ro("device_id", &ChipWorker::device_id)
         .def_prop_ro("initialized", &ChipWorker::initialized)
 
@@ -166,7 +166,7 @@ def _chip_process_loop(
 
             error = 0
             try:
-                cw.run_raw(callable_ptr, heap_args_ptr, block_dim, aicpu_tn, bool(profiling))
+                cw.run_raw(callable_ptr, heap_args_ptr, block_dim, aicpu_tn, profiling)
             except Exception:  # noqa: BLE001
                 error = 1
             struct.pack_into("i", buf, _CHIP_OFF_ERROR, error)
 
@@ -123,6 +123,12 @@ def _load_module_from_path(module_path: Path, module_name: str):
     return module
 
 
+def _normalize_perf_level(v) -> int:
+    """Convert an ``enable_profiling`` value to an integer profiling level.
+
+    Booleans map to ``3`` (``True``) or ``0`` (``False``); any other value is
+    passed through ``int()``. No range check is applied to the result.
+    """
+    # bool is tested first: it is a subclass of int, so int(True) would give 1, not 3.
+    if isinstance(v, bool):
+        return 3 if v else 0
+    return int(v)
+
+
 def _kernel_config_runtime_env(kernel_config_module, kernels_dir: Path) -> dict[str, str]:
     """
     Optional per-example environment variables for runtime compilation.
@@ -192,7 +198,7 @@ def __init__(  # noqa: PLR0913
         golden_path: str,
         device_id: Optional[int] = None,
         platform: str = "a2a3",
-        enable_profiling: bool = False,
+        enable_profiling: int = 0,
         run_all_cases: bool = False,
         case_name: Optional[str] = None,
         pto_isa_commit: Optional[str] = None,
@@ -211,7 +217,7 @@ def __init__(  # noqa: PLR0913
         self.kernels_dir = Path(kernels_dir).resolve()
         self.golden_path = Path(golden_path).resolve()
         self.platform = platform
-        self.enable_profiling = enable_profiling
+        self._perf_level = _normalize_perf_level(enable_profiling)
         self.skip_golden = skip_golden
         self.project_root = PROJECT_ROOT
 
@@ -605,9 +611,9 @@ def _compile_one_kernel(kernel):
                 config = ChipCallConfig()
                 config.block_dim = self.block_dim
                 config.aicpu_thread_num = self.aicpu_thread_num
-                if self.enable_profiling and round_idx == 0:
-                    config.enable_profiling = True
-                    logger.info("Profiling enabled")
+                if self._perf_level > 0 and round_idx == 0:
+                    config.enable_profiling = self._perf_level
+                    logger.info(f"Swimlane profiling enabled (mode={self._perf_level})")
 
                 with _temporary_env(run_env):
                     worker.run(chip_callable, orch_args, config)
 
@@ -499,7 +499,7 @@ def build_callable(self, platform):
             return self._compile_l3_callables(platform)
         raise ValueError(f"Unsupported level: {self._st_level}")
 
-    def _build_config(self, config_dict, enable_profiling=False):
+    def _build_config(self, config_dict, enable_profiling=0):
         from simpler.task_interface import ChipCallConfig  # noqa: PLC0415
 
         config = ChipCallConfig()
@@ -575,7 +575,7 @@ def _run_and_validate_l2(self, worker, callable_obj, case, rounds=1, skip_golden
                 for name, initial in initial_outputs.items():
                     getattr(test_args, name).copy_(initial)
 
-            config = self._build_config(config_dict, enable_profiling=(enable_profiling and round_idx == 0))
+            config = self._build_config(config_dict, enable_profiling=(enable_profiling if round_idx == 0 else 0))
 
             with _temporary_env(self._resolve_env()):
                 worker.run(callable_obj, chip_args, config=config)
@@ -619,7 +619,7 @@ def _run_and_validate_l3(
                 for name, initial in initial_tensors.items():
                     getattr(test_args, name).copy_(initial)
 
-            config = self._build_config(config_dict, enable_profiling=(enable_profiling and round_idx == 0))
+            config = self._build_config(config_dict, enable_profiling=(enable_profiling if round_idx == 0 else 0))
 
             # Wrap in Task — user orch signature: (orch, callables, task_args, config)
             def task_orch(orch, _unused, _ns=ns, _test_args=test_args, _config=config):
@@ -685,7 +685,15 @@ def run_module(module_name):
         parser.add_argument("--all-cases", action="store_true", help="Include manual cases")
         parser.add_argument("-n", "--rounds", type=int, default=1, help="Run each case N times (default: 1)")
         parser.add_argument("--skip-golden", action="store_true", help="Skip golden comparison (benchmark mode)")
-        parser.add_argument("--enable-profiling", action="store_true", help="Enable profiling (first round only)")
+        parser.add_argument(
+            "--enable-profiling",
+            type=int,
+            nargs="?",
+            const=3,
+            default=0,
+            metavar="LEVEL",
+            help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)",
+        )
         parser.add_argument("--build", action="store_true", help="Compile runtime from source")
         parser.add_argument(
             "--log-level",
 
@@ -332,6 +332,12 @@ class PerformanceCollector {
      */
     bool is_initialized() const { return perf_shared_mem_host_ != nullptr; }
 
+    /**
+     * Set profiling level before initialize().
+     * 0=off, 1=AICore-only, 2=task+fanout, 3=full (with phase buffers)
+     *
+     * The value is stored in perf_level_ as given; this setter does not validate or clamp it.
+     */
+    void set_perf_level(int level) { perf_level_ = level; }
+
     /**
      * Drain remaining buffers from the memory manager's ready queue
      *
@@ -387,6 +393,9 @@ class PerformanceCollector {
     PerfRegisterCallback register_cb_{nullptr};
     PerfFreeCallback free_cb_{nullptr};
 
+    // Profiling level: 0=off, 1=AICore-only, 2=task+fanout, 3=full (with phase)
+    int perf_level_{0};
+
     // Memory manager
     ProfMemoryManager memory_manager_;
 
 
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * Shared helper: set perf_level and legacy enable_profiling on a Runtime struct.
+ *
+ * Used by both onboard and sim pto_runtime_c_api.cpp implementations.
+ * Some runtime structs still carry a bool enable_profiling member alongside
+ * the newer int perf_level.  This template detects the legacy member at
+ * compile time and keeps both in sync.
+ */
+
+#pragma once
+
+#include <type_traits>
+
+// Detection trait: HasEnableProfilingMember<T>::value is true when the expression
+// `std::declval<T &>().enable_profiling` is well-formed, and false otherwise.
+// NOTE(review): std::declval is declared in <utility>; this header includes only
+// <type_traits>, so it relies on that header (or an earlier include) providing it.
+// Consider including <utility> explicitly.
+//
+// Primary template: fallback used when the partial specialization below is not viable.
+template <typename T, typename = void>
+struct HasEnableProfilingMember : std::false_type {};
+
+// Partial specialization: selected when the member-access expression is valid for T.
+template <typename T>
+struct HasEnableProfilingMember<T, std::void_t<decltype(std::declval<T &>().enable_profiling)>> : std::true_type {};
+
+/**
+ * Store the requested profiling level on a runtime object.
+ *
+ * Always assigns `runtime->perf_level`. When R also has an `enable_profiling`
+ * member (as reported by HasEnableProfilingMember), that member is set to
+ * `enable_profiling > 0`.
+ *
+ * `runtime` is dereferenced without a null check.
+ *
+ * @tparam R Runtime type; must expose an assignable `perf_level` member.
+ * @param runtime Object to update.
+ * @param enable_profiling Integer profiling level (despite the name, not a bool).
+ */
+template <typename R>
+static inline void set_runtime_profiling_mode(R *runtime, int enable_profiling) {
+    runtime->perf_level = enable_profiling;
+    // This branch is discarded at compile time for types without the legacy member.
+    if constexpr (HasEnableProfilingMember<R>::value) {
+        runtime->enable_profiling = (enable_profiling > 0);
+    }
+}
@@ -461,7 +461,7 @@ int DeviceRunner::run(
     });
 
     // Initialize performance profiling if enabled
-    if (runtime.enable_profiling) {
+    if (runtime.perf_level > 0) {
         rc = init_performance_profiling(runtime, num_aicore, device_id);
         if (rc != 0) {
             LOG_ERROR("init_performance_profiling failed: %d", rc);
@@ -530,13 +530,13 @@ int DeviceRunner::run(
     {
         // Poll and collect performance data in a separate collector thread
         std::thread collector_thread;
-        if (runtime.enable_profiling) {
+        if (runtime.perf_level > 0) {
             collector_thread = create_thread([this, &runtime]() {
                 poll_and_collect_performance_data(runtime.get_task_count());
             });
         }
         auto thread_guard = RAIIScopeGuard([&]() {
-            if (runtime.enable_profiling && collector_thread.joinable()) {
+            if (runtime.perf_level > 0 && collector_thread.joinable()) {
                 collector_thread.join();
             }
         });
@@ -557,13 +557,13 @@ int DeviceRunner::run(
         }
 
         // Signal collector that device execution is complete
-        if (runtime.enable_profiling) {
+        if (runtime.perf_level > 0) {
             perf_collector_.signal_execution_complete();
         }
     }
 
     // Stop memory management, drain remaining buffers, collect phase data, export
-    if (runtime.enable_profiling) {
+    if (runtime.perf_level > 0) {
         perf_collector_.stop_memory_manager();
         perf_collector_.drain_remaining_buffers();
         perf_collector_.scan_remaining_perf_buffers();
@@ -822,6 +822,7 @@ int DeviceRunner::init_performance_profiling(Runtime &runtime, int num_aicore, i
         return rtFree(dev_ptr);
     };
 
+    perf_collector_.set_perf_level(runtime.perf_level);
     return perf_collector_.initialize(runtime, num_aicore, device_id, alloc_cb, register_cb, free_cb);
 }
 
 
@@ -26,6 +26,7 @@
 #include "common/unified_log.h"
 #include "device_runner.h"
 #include "host/raii_scope_guard.h"
+#include "host/runtime_profiling_mode.h"
 #include "runtime.h"
 
 extern "C" {
@@ -162,9 +163,7 @@ int run_runtime(
             return rc;
         }
 
-        if (enable_profiling) {
-            r->enable_profiling = true;
-        }
+        set_runtime_profiling_mode(r, enable_profiling);
 
         std::vector<uint8_t> aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size);
         std::vector<uint8_t> aicore_vec(aicore_binary, aicore_binary + aicore_size);
Original file line number	Diff line number	Diff line change
`@@ -67,7 +67,12 @@ def pytest_addoption(parser):`
`67`	`67`	`"--skip-golden", action="store_true", default=False, help="Skip golden comparison (benchmark mode)"`
`68`	`68`	`)`
`69`	`69`	`parser.addoption(`
`70`		`- "--enable-profiling", action="store_true", default=False, help="Enable profiling (first round only)"`
	`70`	`+ "--enable-profiling",`
	`71`	`+ type=int,`
	`72`	`+ nargs="?",`
	`73`	`+ const=3,`
	`74`	`+ default=0,`
	`75`	`+ help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)",`
`71`	`76`	`)`
`72`	`77`	`parser.addoption("--build", action="store_true", default=False, help="Compile runtime from source")`
`73`	`78`