Skip to content

Commit 5b10ea4

Browse files
committed
Refactor: introduce tiered profiling levels for a2a3 tensormap_and_ringbuffer swimlane export
Replace the boolean enable_profiling flag with an integer perf_level throughout the profiling pipeline (CLI → Python bindings → C++ runtime → AICPU executor → PerformanceCollector → JSON export).

Profiling levels:
- 0 = off
- 1 = AICore task start/end timing only (JSON version 0)
- 2 = + dispatch timestamps, finish timestamps, fanout edges (JSON version 1)
- 3 = + AICPU scheduler/orchestrator phase buffers (JSON version 2)

Key changes:
- ChipCallConfig: bool enable_profiling → int perf_level
- CLI --enable-profiling: store_true → optional int (bare flag defaults to 3)
- nanobind property: backward-compatible bool→3 coercion for legacy callers
- AICPU executor: split into task_recording_enabled (>0) vs phase_recording_enabled (>=3) to skip phase overhead at lower levels
- PerformanceCollector: skip phase buffer allocation when perf_level < 3; version selection based on perf_level and presence of phase data
- swimlane_converter.py: accept version 0, tolerate missing fanout field
- Fix scene_test.py: `val and cond` truncated int to bool; use ternary
1 parent 448d024 commit 5b10ea4

29 files changed

Lines changed: 276 additions & 163 deletions

File tree

conftest.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,12 @@ def pytest_addoption(parser):
6767
"--skip-golden", action="store_true", default=False, help="Skip golden comparison (benchmark mode)"
6868
)
6969
parser.addoption(
70-
"--enable-profiling", action="store_true", default=False, help="Enable profiling (first round only)"
70+
"--enable-profiling",
71+
type=int,
72+
nargs="?",
73+
const=3,
74+
default=0,
75+
help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)",
7176
)
7277
parser.addoption("--build", action="store_true", default=False, help="Compile runtime from source")
7378

examples/scripts/run_example.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,8 +139,12 @@ def compute_golden(tensors: dict, params: dict) -> None:
139139

140140
parser.add_argument(
141141
"--enable-profiling",
142-
action="store_true",
143-
help="Enable profiling and generate swimlane.json",
142+
type=int,
143+
nargs="?",
144+
const=3,
145+
default=0,
146+
metavar="LEVEL",
147+
help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)",
144148
)
145149

146150
parser.add_argument(

python/bindings/task_interface.cpp

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -540,11 +540,23 @@ NB_MODULE(_task_interface, m) {
540540
.def(nb::init<>())
541541
.def_rw("block_dim", &ChipCallConfig::block_dim)
542542
.def_rw("aicpu_thread_num", &ChipCallConfig::aicpu_thread_num)
543-
.def_rw("enable_profiling", &ChipCallConfig::enable_profiling)
543+
.def_prop_rw(
544+
"enable_profiling",
545+
[](const ChipCallConfig &self) {
546+
return self.perf_level;
547+
},
548+
[](ChipCallConfig &self, nb::object v) {
549+
if (nb::isinstance<nb::bool_>(v)) {
550+
self.perf_level = nb::cast<bool>(v) ? 3 : 0;
551+
} else {
552+
self.perf_level = nb::cast<int>(v);
553+
}
554+
}
555+
)
544556
.def("__repr__", [](const ChipCallConfig &self) -> std::string {
545557
std::ostringstream os;
546558
os << "ChipCallConfig(block_dim=" << self.block_dim << ", aicpu_thread_num=" << self.aicpu_thread_num
547-
<< ", enable_profiling=" << (self.enable_profiling ? "True" : "False") << ")";
559+
<< ", enable_profiling=" << self.perf_level << ")";
548560
return os.str();
549561
});
550562

@@ -569,15 +581,15 @@ NB_MODULE(_task_interface, m) {
569581
.def(
570582
"run_raw",
571583
[](ChipWorker &self, uint64_t callable, uint64_t args, int block_dim, int aicpu_thread_num,
572-
bool enable_profiling) {
584+
int perf_level) {
573585
ChipCallConfig config;
574586
config.block_dim = block_dim;
575587
config.aicpu_thread_num = aicpu_thread_num;
576-
config.enable_profiling = enable_profiling;
588+
config.perf_level = perf_level;
577589
self.run(reinterpret_cast<const void *>(callable), reinterpret_cast<const void *>(args), config);
578590
},
579591
nb::arg("callable"), nb::arg("args"), nb::arg("block_dim") = 1, nb::arg("aicpu_thread_num") = 3,
580-
nb::arg("enable_profiling") = false, "Run with raw pointer arguments (used from forked chip process)."
592+
nb::arg("perf_level") = 0, "Run with raw pointer arguments (used from forked chip process)."
581593
)
582594
.def_prop_ro("device_id", &ChipWorker::device_id)
583595
.def_prop_ro("initialized", &ChipWorker::initialized)

python/simpler/worker.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ def _chip_process_loop(
166166

167167
error = 0
168168
try:
169-
cw.run_raw(callable_ptr, heap_args_ptr, block_dim, aicpu_tn, bool(profiling))
169+
cw.run_raw(callable_ptr, heap_args_ptr, block_dim, aicpu_tn, profiling)
170170
except Exception: # noqa: BLE001
171171
error = 1
172172
struct.pack_into("i", buf, _CHIP_OFF_ERROR, error)

simpler_setup/code_runner.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,12 @@ def _load_module_from_path(module_path: Path, module_name: str):
123123
return module
124124

125125

126+
def _normalize_perf_level(v) -> int:
127+
if isinstance(v, bool):
128+
return 3 if v else 0
129+
return int(v)
130+
131+
126132
def _kernel_config_runtime_env(kernel_config_module, kernels_dir: Path) -> dict[str, str]:
127133
"""
128134
Optional per-example environment variables for runtime compilation.
@@ -192,7 +198,7 @@ def __init__( # noqa: PLR0913
192198
golden_path: str,
193199
device_id: Optional[int] = None,
194200
platform: str = "a2a3",
195-
enable_profiling: bool = False,
201+
enable_profiling: int = 0,
196202
run_all_cases: bool = False,
197203
case_name: Optional[str] = None,
198204
pto_isa_commit: Optional[str] = None,
@@ -211,7 +217,7 @@ def __init__( # noqa: PLR0913
211217
self.kernels_dir = Path(kernels_dir).resolve()
212218
self.golden_path = Path(golden_path).resolve()
213219
self.platform = platform
214-
self.enable_profiling = enable_profiling
220+
self._perf_level = _normalize_perf_level(enable_profiling)
215221
self.skip_golden = skip_golden
216222
self.project_root = PROJECT_ROOT
217223

@@ -605,9 +611,9 @@ def _compile_one_kernel(kernel):
605611
config = ChipCallConfig()
606612
config.block_dim = self.block_dim
607613
config.aicpu_thread_num = self.aicpu_thread_num
608-
if self.enable_profiling and round_idx == 0:
609-
config.enable_profiling = True
610-
logger.info("Profiling enabled")
614+
if self._perf_level > 0 and round_idx == 0:
615+
config.enable_profiling = self._perf_level
616+
logger.info(f"Swimlane profiling enabled (mode={self._perf_level})")
611617

612618
with _temporary_env(run_env):
613619
worker.run(chip_callable, orch_args, config)

simpler_setup/scene_test.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -499,7 +499,7 @@ def build_callable(self, platform):
499499
return self._compile_l3_callables(platform)
500500
raise ValueError(f"Unsupported level: {self._st_level}")
501501

502-
def _build_config(self, config_dict, enable_profiling=False):
502+
def _build_config(self, config_dict, enable_profiling=0):
503503
from simpler.task_interface import ChipCallConfig # noqa: PLC0415
504504

505505
config = ChipCallConfig()
@@ -575,7 +575,7 @@ def _run_and_validate_l2(self, worker, callable_obj, case, rounds=1, skip_golden
575575
for name, initial in initial_outputs.items():
576576
getattr(test_args, name).copy_(initial)
577577

578-
config = self._build_config(config_dict, enable_profiling=(enable_profiling and round_idx == 0))
578+
config = self._build_config(config_dict, enable_profiling=(enable_profiling if round_idx == 0 else 0))
579579

580580
with _temporary_env(self._resolve_env()):
581581
worker.run(callable_obj, chip_args, config=config)
@@ -619,7 +619,7 @@ def _run_and_validate_l3(
619619
for name, initial in initial_tensors.items():
620620
getattr(test_args, name).copy_(initial)
621621

622-
config = self._build_config(config_dict, enable_profiling=(enable_profiling and round_idx == 0))
622+
config = self._build_config(config_dict, enable_profiling=(enable_profiling if round_idx == 0 else 0))
623623

624624
# Wrap in Task — user orch signature: (orch, callables, task_args, config)
625625
def task_orch(orch, _unused, _ns=ns, _test_args=test_args, _config=config):
@@ -685,7 +685,15 @@ def run_module(module_name):
685685
parser.add_argument("--all-cases", action="store_true", help="Include manual cases")
686686
parser.add_argument("-n", "--rounds", type=int, default=1, help="Run each case N times (default: 1)")
687687
parser.add_argument("--skip-golden", action="store_true", help="Skip golden comparison (benchmark mode)")
688-
parser.add_argument("--enable-profiling", action="store_true", help="Enable profiling (first round only)")
688+
parser.add_argument(
689+
"--enable-profiling",
690+
type=int,
691+
nargs="?",
692+
const=3,
693+
default=0,
694+
metavar="LEVEL",
695+
help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)",
696+
)
689697
parser.add_argument("--build", action="store_true", help="Compile runtime from source")
690698
parser.add_argument(
691699
"--log-level",

src/a2a3/platform/include/host/performance_collector.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,12 @@ class PerformanceCollector {
332332
*/
333333
bool is_initialized() const { return perf_shared_mem_host_ != nullptr; }
334334

335+
/**
336+
* Set profiling level before initialize().
337+
* 0=off, 1=AICore-only, 2=task+fanout, 3=full (with phase buffers)
338+
*/
339+
void set_perf_level(int level) {
    // Remember the requested profiling level.
    perf_level_ = level;
}
340+
335341
/**
336342
* Drain remaining buffers from the memory manager's ready queue
337343
*
@@ -387,6 +393,9 @@ class PerformanceCollector {
387393
PerfRegisterCallback register_cb_{nullptr};
388394
PerfFreeCallback free_cb_{nullptr};
389395

396+
// Profiling level: 0=off, 1=AICore-only, 2=task+fanout, 3=full (with phase)
397+
int perf_level_{0};
398+
390399
// Memory manager
391400
ProfMemoryManager memory_manager_;
392401

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
/*
2+
* Copyright (c) PyPTO Contributors.
3+
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
4+
* CANN Open Software License Agreement Version 2.0 (the "License").
5+
* Please refer to the License for details. You may not use this file except in compliance with the License.
6+
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
7+
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
8+
* See LICENSE in the root of the software repository for the full text of the License.
9+
* -----------------------------------------------------------------------------------------------------------
10+
*/
11+
12+
/**
13+
* Shared helper: set perf_level and legacy enable_profiling on a Runtime struct.
14+
*
15+
* Used by both onboard and sim pto_runtime_c_api.cpp implementations.
16+
* Some runtime structs still carry a bool enable_profiling member alongside
17+
* the newer int perf_level. This template detects the legacy member at
18+
* compile time and keeps both in sync.
19+
*/
20+
21+
#pragma once
22+
23+
#include <type_traits>
24+
25+
/**
 * Compile-time probe for a legacy `enable_profiling` data member.
 *
 * The primary template reports false; the partial specialization below is
 * selected only when `t.enable_profiling` is a well-formed expression for
 * an lvalue `t` of type T.
 */
template <typename T, typename = void>
struct HasEnableProfilingMember : std::false_type {};

template <typename T>
struct HasEnableProfilingMember<T, decltype(void(std::declval<T &>().enable_profiling))> : std::true_type {};

/**
 * Store the profiling level on a runtime object.
 *
 * @param runtime           Runtime object to update; must expose `perf_level`.
 * @param enable_profiling  Profiling level (0 = off, values > 0 = enabled).
 *
 * When R also carries the legacy `enable_profiling` member, it is set to
 * whether the level is greater than zero so both fields stay in sync.
 */
template <typename R>
static inline void set_runtime_profiling_mode(R *runtime, int enable_profiling) {
    const int requested_level = enable_profiling;
    runtime->perf_level = requested_level;
    if constexpr (HasEnableProfilingMember<R>::value) {
        const bool profiling_on = requested_level > 0;
        runtime->enable_profiling = profiling_on;
    }
}

src/a2a3/platform/onboard/host/device_runner.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,7 @@ int DeviceRunner::run(
461461
});
462462

463463
// Initialize performance profiling if enabled
464-
if (runtime.enable_profiling) {
464+
if (runtime.perf_level > 0) {
465465
rc = init_performance_profiling(runtime, num_aicore, device_id);
466466
if (rc != 0) {
467467
LOG_ERROR("init_performance_profiling failed: %d", rc);
@@ -530,13 +530,13 @@ int DeviceRunner::run(
530530
{
531531
// Poll and collect performance data in a separate collector thread
532532
std::thread collector_thread;
533-
if (runtime.enable_profiling) {
533+
if (runtime.perf_level > 0) {
534534
collector_thread = create_thread([this, &runtime]() {
535535
poll_and_collect_performance_data(runtime.get_task_count());
536536
});
537537
}
538538
auto thread_guard = RAIIScopeGuard([&]() {
539-
if (runtime.enable_profiling && collector_thread.joinable()) {
539+
if (runtime.perf_level > 0 && collector_thread.joinable()) {
540540
collector_thread.join();
541541
}
542542
});
@@ -557,13 +557,13 @@ int DeviceRunner::run(
557557
}
558558

559559
// Signal collector that device execution is complete
560-
if (runtime.enable_profiling) {
560+
if (runtime.perf_level > 0) {
561561
perf_collector_.signal_execution_complete();
562562
}
563563
}
564564

565565
// Stop memory management, drain remaining buffers, collect phase data, export
566-
if (runtime.enable_profiling) {
566+
if (runtime.perf_level > 0) {
567567
perf_collector_.stop_memory_manager();
568568
perf_collector_.drain_remaining_buffers();
569569
perf_collector_.scan_remaining_perf_buffers();
@@ -822,6 +822,7 @@ int DeviceRunner::init_performance_profiling(Runtime &runtime, int num_aicore, i
822822
return rtFree(dev_ptr);
823823
};
824824

825+
perf_collector_.set_perf_level(runtime.perf_level);
825826
return perf_collector_.initialize(runtime, num_aicore, device_id, alloc_cb, register_cb, free_cb);
826827
}
827828

src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "common/unified_log.h"
2727
#include "device_runner.h"
2828
#include "host/raii_scope_guard.h"
29+
#include "host/runtime_profiling_mode.h"
2930
#include "runtime.h"
3031

3132
extern "C" {
@@ -162,9 +163,7 @@ int run_runtime(
162163
return rc;
163164
}
164165

165-
if (enable_profiling) {
166-
r->enable_profiling = true;
167-
}
166+
set_runtime_profiling_mode(r, enable_profiling);
168167

169168
std::vector<uint8_t> aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size);
170169
std::vector<uint8_t> aicore_vec(aicore_binary, aicore_binary + aicore_size);

0 commit comments

Comments
 (0)