Skip to content

Commit a0f75e3

Browse files
committed
Refactor: introduce tiered profiling levels for a2a3 tensormap_and_ringbuffer swimlane export
Replace the boolean enable_profiling flag with an integer perf_level throughout the profiling pipeline (CLI → Python bindings → C++ runtime → AICPU executor → PerformanceCollector → JSON export).

Profiling levels:
- 0 = off
- 1 = AICore task start/end timing only (JSON version 0)
- 2 = level 1 plus dispatch timestamps, finish timestamps, fanout edges (JSON version 1)
- 3 = level 2 plus AICPU scheduler/orchestrator phase buffers (JSON version 2)

Key changes:
- ChipCallConfig: bool enable_profiling → int perf_level
- CLI --enable-profiling: store_true → optional int (a bare flag defaults to 3)
- nanobind property: backward-compatible bool→3 coercion for legacy callers
- AICPU executor: split into task_recording_enabled (>0) vs phase_recording_enabled (>=3) to skip phase overhead at lower levels
- PerformanceCollector: skip phase buffer allocation when perf_level < 3; JSON version selection based on perf_level and presence of phase data
- swimlane_converter.py: accept version 0; tolerate a missing fanout field
- Fix scene_test.py: `val and cond` truncated the int level to bool; use a ternary instead
1 parent e3b07d9 commit a0f75e3

28 files changed

Lines changed: 279 additions & 166 deletions

File tree

conftest.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,12 @@ def pytest_addoption(parser):
8484
"--skip-golden", action="store_true", default=False, help="Skip golden comparison (benchmark mode)"
8585
)
8686
parser.addoption(
87-
"--enable-profiling", action="store_true", default=False, help="Enable profiling (first round only)"
87+
"--enable-profiling",
88+
type=int,
89+
nargs="?",
90+
const=3,
91+
default=0,
92+
help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)",
8893
)
8994
parser.addoption("--dump-tensor", action="store_true", default=False, help="Dump per-task tensor I/O at runtime")
9095
parser.addoption("--build", action="store_true", default=False, help="Compile runtime from source")

examples/scripts/run_example.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,8 +139,12 @@ def compute_golden(tensors: dict, params: dict) -> None:
139139

140140
parser.add_argument(
141141
"--enable-profiling",
142-
action="store_true",
143-
help="Enable profiling and generate swimlane.json",
142+
type=int,
143+
nargs="?",
144+
const=3,
145+
default=0,
146+
metavar="LEVEL",
147+
help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)",
144148
)
145149

146150
parser.add_argument(

python/bindings/task_interface.cpp

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -540,12 +540,24 @@ NB_MODULE(_task_interface, m) {
540540
.def(nb::init<>())
541541
.def_rw("block_dim", &ChipCallConfig::block_dim)
542542
.def_rw("aicpu_thread_num", &ChipCallConfig::aicpu_thread_num)
543-
.def_rw("enable_profiling", &ChipCallConfig::enable_profiling)
543+
.def_prop_rw(
544+
"enable_profiling",
545+
[](const ChipCallConfig &self) {
546+
return self.perf_level;
547+
},
548+
[](ChipCallConfig &self, nb::object v) {
549+
if (nb::isinstance<nb::bool_>(v)) {
550+
self.perf_level = nb::cast<bool>(v) ? 3 : 0;
551+
} else {
552+
self.perf_level = nb::cast<int>(v);
553+
}
554+
}
555+
)
544556
.def_rw("enable_dump_tensor", &ChipCallConfig::enable_dump_tensor)
545557
.def("__repr__", [](const ChipCallConfig &self) -> std::string {
546558
std::ostringstream os;
547559
os << "ChipCallConfig(block_dim=" << self.block_dim << ", aicpu_thread_num=" << self.aicpu_thread_num
548-
<< ", enable_profiling=" << (self.enable_profiling ? "True" : "False")
560+
<< ", enable_profiling=" << self.perf_level
549561
<< ", enable_dump_tensor=" << (self.enable_dump_tensor ? "True" : "False") << ")";
550562
return os.str();
551563
});
@@ -571,29 +583,29 @@ NB_MODULE(_task_interface, m) {
571583
.def(
572584
"run_raw",
573585
[](ChipWorker &self, uint64_t callable, uint64_t args, int block_dim, int aicpu_thread_num,
574-
bool enable_profiling) {
586+
int perf_level) {
575587
ChipCallConfig config;
576588
config.block_dim = block_dim;
577589
config.aicpu_thread_num = aicpu_thread_num;
578-
config.enable_profiling = enable_profiling;
590+
config.perf_level = perf_level;
579591
self.run(reinterpret_cast<const void *>(callable), reinterpret_cast<const void *>(args), config);
580592
},
581593
nb::arg("callable"), nb::arg("args"), nb::arg("block_dim") = 1, nb::arg("aicpu_thread_num") = 3,
582-
nb::arg("enable_profiling") = false, "Run with a raw ChipStorageTaskArgs POD pointer."
594+
nb::arg("perf_level") = 0, "Run with a raw ChipStorageTaskArgs POD pointer."
583595
)
584596
.def(
585597
"run_from_blob",
586598
[](ChipWorker &self, uint64_t callable, uint64_t blob_ptr, int block_dim, int aicpu_thread_num,
587-
bool enable_profiling) {
599+
int perf_level) {
588600
ChipCallConfig config;
589601
config.block_dim = block_dim;
590602
config.aicpu_thread_num = aicpu_thread_num;
591-
config.enable_profiling = enable_profiling;
603+
config.perf_level = perf_level;
592604
TaskArgsView view = read_blob(reinterpret_cast<const uint8_t *>(blob_ptr));
593605
self.run(callable, view, config);
594606
},
595607
nb::arg("callable"), nb::arg("blob_ptr"), nb::arg("block_dim") = 1, nb::arg("aicpu_thread_num") = 3,
596-
nb::arg("enable_profiling") = false,
608+
nb::arg("perf_level") = 0,
597609
"Decode a length-prefixed TaskArgs blob ([T][S][tensors][scalars]) at "
598610
"blob_ptr and dispatch to the runtime. Used from forked chip processes "
599611
"reading the WorkerThread mailbox."

python/simpler/worker.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ def _chip_process_loop(
161161

162162
error = 0
163163
try:
164-
cw.run_from_blob(callable_ptr, args_ptr, block_dim, aicpu_tn, bool(profiling))
164+
cw.run_from_blob(callable_ptr, args_ptr, block_dim, aicpu_tn, profiling)
165165
except Exception: # noqa: BLE001
166166
error = 1
167167
struct.pack_into("i", buf, _CHIP_OFF_ERROR, error)

simpler_setup/code_runner.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,12 @@ def _load_module_from_path(module_path: Path, module_name: str):
123123
return module
124124

125125

126+
def _normalize_perf_level(v) -> int:
127+
if isinstance(v, bool):
128+
return 3 if v else 0
129+
return int(v)
130+
131+
126132
def _kernel_config_runtime_env(kernel_config_module, kernels_dir: Path) -> dict[str, str]:
127133
"""
128134
Optional per-example environment variables for runtime compilation.
@@ -192,7 +198,7 @@ def __init__( # noqa: PLR0913
192198
golden_path: str,
193199
device_id: Optional[int] = None,
194200
platform: str = "a2a3",
195-
enable_profiling: bool = False,
201+
enable_profiling: int = 0,
196202
enable_dump_tensor: bool = False,
197203
run_all_cases: bool = False,
198204
case_name: Optional[str] = None,
@@ -212,7 +218,7 @@ def __init__( # noqa: PLR0913
212218
self.kernels_dir = Path(kernels_dir).resolve()
213219
self.golden_path = Path(golden_path).resolve()
214220
self.platform = platform
215-
self.enable_profiling = enable_profiling
221+
self._perf_level = _normalize_perf_level(enable_profiling)
216222
self.enable_dump_tensor = enable_dump_tensor
217223
self.skip_golden = skip_golden
218224
self.project_root = PROJECT_ROOT
@@ -607,9 +613,9 @@ def _compile_one_kernel(kernel):
607613
config = ChipCallConfig()
608614
config.block_dim = self.block_dim
609615
config.aicpu_thread_num = self.aicpu_thread_num
610-
if self.enable_profiling and round_idx == 0:
611-
config.enable_profiling = True
612-
logger.info("Profiling enabled")
616+
if self._perf_level > 0 and round_idx == 0:
617+
config.enable_profiling = self._perf_level
618+
logger.info(f"Swimlane profiling enabled (mode={self._perf_level})")
613619
if self.enable_dump_tensor:
614620
config.enable_dump_tensor = True
615621
logger.info("Dump tensor enabled")

simpler_setup/scene_test.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -700,7 +700,7 @@ def build_callable(self, platform):
700700
return self._compile_l3_callables(platform)
701701
raise ValueError(f"Unsupported level: {self._st_level}")
702702

703-
def _build_config(self, config_dict, enable_profiling=False, enable_dump_tensor=False):
703+
def _build_config(self, config_dict, enable_profiling=0, enable_dump_tensor=False):
704704
from simpler.task_interface import ChipCallConfig # noqa: PLC0415
705705

706706
config = ChipCallConfig()
@@ -791,7 +791,7 @@ def _run_and_validate_l2(
791791

792792
config = self._build_config(
793793
config_dict,
794-
enable_profiling=(enable_profiling and round_idx == 0),
794+
enable_profiling=(enable_profiling if round_idx == 0 else 0),
795795
enable_dump_tensor=enable_dump_tensor,
796796
)
797797

@@ -847,7 +847,7 @@ def _run_and_validate_l3(
847847

848848
config = self._build_config(
849849
config_dict,
850-
enable_profiling=(enable_profiling and round_idx == 0),
850+
enable_profiling=(enable_profiling if round_idx == 0 else 0),
851851
enable_dump_tensor=enable_dump_tensor,
852852
)
853853

@@ -948,7 +948,15 @@ def run_module(module_name):
948948
)
949949
parser.add_argument("-n", "--rounds", type=int, default=1, help="Run each case N times (default: 1)")
950950
parser.add_argument("--skip-golden", action="store_true", help="Skip golden comparison (benchmark mode)")
951-
parser.add_argument("--enable-profiling", action="store_true", help="Enable profiling (first round only)")
951+
parser.add_argument(
952+
"--enable-profiling",
953+
type=int,
954+
nargs="?",
955+
const=3,
956+
default=0,
957+
metavar="LEVEL",
958+
help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)",
959+
)
952960
parser.add_argument("--dump-tensor", action="store_true", help="Dump per-task tensor I/O at runtime")
953961
parser.add_argument("--build", action="store_true", help="Compile runtime from source")
954962
parser.add_argument(

src/a2a3/platform/include/host/performance_collector.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,12 @@ class PerformanceCollector {
332332
*/
333333
bool is_initialized() const { return perf_shared_mem_host_ != nullptr; }
334334

335+
/**
336+
* Set profiling level before initialize().
337+
* 0=off, 1=AICore-only, 2=task+fanout, 3=full (with phase buffers)
338+
*/
339+
void set_perf_level(int level) { perf_level_ = level; }
340+
335341
/**
336342
* Drain remaining buffers from the memory manager's ready queue
337343
*
@@ -387,6 +393,9 @@ class PerformanceCollector {
387393
PerfRegisterCallback register_cb_{nullptr};
388394
PerfFreeCallback free_cb_{nullptr};
389395

396+
// Profiling level: 0=off, 1=AICore-only, 2=task+fanout, 3=full (with phase)
397+
int perf_level_{0};
398+
390399
// Memory manager
391400
ProfMemoryManager memory_manager_;
392401

src/a2a3/platform/include/host/runtime_profiling_mode.h (new file)

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
/*
2+
* Copyright (c) PyPTO Contributors.
3+
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
4+
* CANN Open Software License Agreement Version 2.0 (the "License").
5+
* Please refer to the License for details. You may not use this file except in compliance with the License.
6+
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
7+
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
8+
* See LICENSE in the root of the software repository for the full text of the License.
9+
* -----------------------------------------------------------------------------------------------------------
10+
*/
11+
12+
/**
13+
* Shared helper: set perf_level and legacy enable_profiling on a Runtime struct.
14+
*
15+
* Used by both onboard and sim pto_runtime_c_api.cpp implementations.
16+
* Some runtime structs still carry a bool enable_profiling member alongside
17+
* the newer int perf_level. This template detects the legacy member at
18+
* compile time and keeps both in sync.
19+
*/
20+
21+
#pragma once
22+
23+
#include <type_traits>
24+
25+
template <typename T, typename = void>
26+
struct HasEnableProfilingMember : std::false_type {};
27+
28+
template <typename T>
29+
struct HasEnableProfilingMember<T, std::void_t<decltype(std::declval<T &>().enable_profiling)>> : std::true_type {};
30+
31+
template <typename R>
32+
static inline void set_runtime_profiling_mode(R *runtime, int enable_profiling) {
33+
runtime->perf_level = enable_profiling;
34+
if constexpr (HasEnableProfilingMember<R>::value) {
35+
runtime->enable_profiling = (enable_profiling > 0);
36+
}
37+
}

src/a2a3/platform/onboard/host/device_runner.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,7 @@ int DeviceRunner::run(
461461
});
462462

463463
// Initialize performance profiling if enabled
464-
if (runtime.enable_profiling) {
464+
if (runtime.perf_level > 0) {
465465
rc = init_performance_profiling(runtime, num_aicore, device_id);
466466
if (rc != 0) {
467467
LOG_ERROR("init_performance_profiling failed: %d", rc);
@@ -540,18 +540,18 @@ int DeviceRunner::run(
540540
{
541541
// Poll and collect performance data in a separate collector thread
542542
std::thread collector_thread;
543-
if (runtime.enable_profiling) {
543+
if (runtime.perf_level > 0) {
544544
collector_thread = create_thread([this, &runtime]() {
545545
poll_and_collect_performance_data(runtime.get_task_count());
546546
});
547547
}
548548
auto thread_guard = RAIIScopeGuard([&]() {
549-
if (runtime.enable_profiling && collector_thread.joinable()) {
549+
if (runtime.perf_level > 0 && collector_thread.joinable()) {
550550
collector_thread.join();
551551
}
552552
});
553553
auto collector_signal_guard = RAIIScopeGuard([this, &runtime]() {
554-
if (runtime.enable_profiling) {
554+
if (runtime.perf_level > 0) {
555555
perf_collector_.signal_execution_complete();
556556
}
557557
});
@@ -588,7 +588,7 @@ int DeviceRunner::run(
588588
}
589589

590590
// Stop memory management, drain remaining buffers, collect phase data, export
591-
if (runtime.enable_profiling) {
591+
if (runtime.perf_level > 0) {
592592
perf_collector_.stop_memory_manager();
593593
perf_collector_.drain_remaining_buffers();
594594
perf_collector_.scan_remaining_perf_buffers();
@@ -872,6 +872,7 @@ int DeviceRunner::init_performance_profiling(Runtime &runtime, int num_aicore, i
872872
return rtFree(dev_ptr);
873873
};
874874

875+
perf_collector_.set_perf_level(runtime.perf_level);
875876
return perf_collector_.initialize(runtime, num_aicore, device_id, alloc_cb, register_cb, free_cb);
876877
}
877878

src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "common/unified_log.h"
2727
#include "device_runner.h"
2828
#include "host/raii_scope_guard.h"
29+
#include "host/runtime_profiling_mode.h"
2930
#include "runtime.h"
3031

3132
extern "C" {
@@ -162,9 +163,7 @@ int run_runtime(
162163
return rc;
163164
}
164165

165-
if (enable_profiling) {
166-
r->enable_profiling = true;
167-
}
166+
set_runtime_profiling_mode(r, enable_profiling);
168167

169168
std::vector<uint8_t> aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size);
170169
std::vector<uint8_t> aicore_vec(aicore_binary, aicore_binary + aicore_size);

0 commit comments

Comments (0)