Skip to content

Commit 0b39cb0

Browse files
committed
Refactor: introduce tiered profiling levels for a2a3 tensormap_and_ringbuffer swimlane export
Replace the boolean `enable_profiling` flag with a 4-level `perf_level` (0=off, 1=AICore-only, 2=task+fanout, 3=full with AICPU phase records). The tensormap_and_ringbuffer runtime honors all four levels, while the legacy host_build_graph / aicpu_build_graph paths continue to treat any non-zero value as a simple on/off switch and stay on their existing bool member (kept in sync via a shared SFINAE helper in runtime_profiling_mode.h).

CLI and JSON are updated to match:
- `--enable-profiling` in run_example.py now takes an optional int (default 3 when the flag is given without a value, 0 otherwise).
- The swimlane JSON schema gains a new version=0 (level 1: AICore-only) that omits dispatch/finish/fanout fields, and swimlane_converter.py accepts it.
- Phase buffer allocation, scheduler-phase recording and orchestrator summary writes in aicpu_executor.cpp are gated on perf_level>=3, so lower levels no longer pay the phase-profiling overhead; fanout/dispatch_timestamp collection is gated on perf_level>=2.

Additionally:
- CallConfig and WorkerPayload switch from bool to int; Python bindings accept both bool and int for backward compatibility (_normalize_perf_level in code_runner.py, getter/setter shim in task_interface.cpp).
- PerformanceCollector skips phase-buffer shared-memory allocation and phase-thread management when perf_level < 3 (calc_perf_data_size path).
- device_runner.cpp (onboard + sim): all enable_profiling guards are replaced with perf_level > 0; set_perf_level() is called before initialize().
- Unit tests are updated for int-based profiling values.
1 parent c0d41a0 commit 0b39cb0

26 files changed

Lines changed: 256 additions & 156 deletions

File tree

examples/scripts/code_runner.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,12 @@ def _get_project_root() -> Path:
133133
return Path(__file__).parent.parent.parent # examples/scripts/ -> examples/ -> simpler/
134134

135135

136+
def _normalize_perf_level(v) -> int:
137+
if isinstance(v, bool):
138+
return 3 if v else 0
139+
return int(v)
140+
141+
136142
def _get_pto_isa_clone_path() -> Path:
137143
"""Get the expected path to pto-isa clone."""
138144
return _get_project_root() / "examples" / "scripts" / "_deps" / "pto-isa"
@@ -477,7 +483,7 @@ def __init__( # noqa: PLR0913
477483
golden_path: str,
478484
device_id: Optional[int] = None,
479485
platform: str = "a2a3",
480-
enable_profiling: bool = False,
486+
enable_profiling: int = 0,
481487
run_all_cases: bool = False,
482488
case_name: Optional[str] = None,
483489
pto_isa_commit: Optional[str] = None,
@@ -492,7 +498,7 @@ def __init__( # noqa: PLR0913
492498
self.kernels_dir = Path(kernels_dir).resolve()
493499
self.golden_path = Path(golden_path).resolve()
494500
self.platform = platform
495-
self.enable_profiling = enable_profiling
501+
self._perf_level = _normalize_perf_level(enable_profiling)
496502
self.skip_golden = skip_golden
497503
self.project_root = _get_project_root()
498504

@@ -887,9 +893,9 @@ def _compile_one_kernel(kernel):
887893
config = ChipCallConfig()
888894
config.block_dim = self.block_dim
889895
config.aicpu_thread_num = self.aicpu_thread_num
890-
if self.enable_profiling and round_idx == 0:
891-
config.enable_profiling = True
892-
logger.info("Profiling enabled")
896+
if self._perf_level > 0 and round_idx == 0:
897+
config.enable_profiling = self._perf_level
898+
logger.info(f"Swimlane profiling enabled (mode={self._perf_level})")
893899

894900
with _temporary_env(run_env):
895901
worker.run(chip_callable, orch_args, config)

examples/scripts/run_example.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,8 +148,12 @@ def compute_golden(tensors: dict, params: dict) -> None:
148148

149149
parser.add_argument(
150150
"--enable-profiling",
151-
action="store_true",
152-
help="Enable profiling and generate swimlane.json",
151+
type=int,
152+
nargs="?",
153+
const=3,
154+
default=0,
155+
metavar="LEVEL",
156+
help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)",
153157
)
154158

155159
parser.add_argument(

python/bindings/task_interface.cpp

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -579,11 +579,23 @@ NB_MODULE(_task_interface, m) {
579579
.def(nb::init<>())
580580
.def_rw("block_dim", &ChipCallConfig::block_dim)
581581
.def_rw("aicpu_thread_num", &ChipCallConfig::aicpu_thread_num)
582-
.def_rw("enable_profiling", &ChipCallConfig::enable_profiling)
582+
.def_prop_rw(
583+
"enable_profiling",
584+
[](const ChipCallConfig &self) {
585+
return self.perf_level;
586+
},
587+
[](ChipCallConfig &self, nb::object v) {
588+
if (nb::isinstance<nb::bool_>(v)) {
589+
self.perf_level = nb::cast<bool>(v) ? 3 : 0;
590+
} else {
591+
self.perf_level = nb::cast<int>(v);
592+
}
593+
}
594+
)
583595
.def("__repr__", [](const ChipCallConfig &self) -> std::string {
584596
std::ostringstream os;
585597
os << "ChipCallConfig(block_dim=" << self.block_dim << ", aicpu_thread_num=" << self.aicpu_thread_num
586-
<< ", enable_profiling=" << (self.enable_profiling ? "True" : "False") << ")";
598+
<< ", enable_profiling=" << self.perf_level << ")";
587599
return os.str();
588600
});
589601

@@ -608,15 +620,15 @@ NB_MODULE(_task_interface, m) {
608620
.def(
609621
"run_raw",
610622
[](ChipWorker &self, uint64_t callable, uint64_t args, int block_dim, int aicpu_thread_num,
611-
bool enable_profiling) {
623+
int perf_level) {
612624
ChipCallConfig config;
613625
config.block_dim = block_dim;
614626
config.aicpu_thread_num = aicpu_thread_num;
615-
config.enable_profiling = enable_profiling;
627+
config.perf_level = perf_level;
616628
self.run(reinterpret_cast<const void *>(callable), reinterpret_cast<const void *>(args), config);
617629
},
618630
nb::arg("callable"), nb::arg("args"), nb::arg("block_dim") = 1, nb::arg("aicpu_thread_num") = 3,
619-
nb::arg("enable_profiling") = false, "Run with raw pointer arguments (used from forked chip process)."
631+
nb::arg("perf_level") = 0, "Run with raw pointer arguments (used from forked chip process)."
620632
)
621633
.def_prop_ro("device_id", &ChipWorker::device_id)
622634
.def_prop_ro("initialized", &ChipWorker::initialized)

python/simpler/worker.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ def _chip_process_loop(
174174

175175
error = 0
176176
try:
177-
cw.run_raw(callable_ptr, heap_args_ptr, block_dim, aicpu_tn, bool(profiling))
177+
cw.run_raw(callable_ptr, heap_args_ptr, block_dim, aicpu_tn, profiling)
178178
except Exception: # noqa: BLE001
179179
error = 1
180180
struct.pack_into("i", buf, _CHIP_OFF_ERROR, error)

src/a2a3/platform/include/host/performance_collector.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,12 @@ class PerformanceCollector {
332332
*/
333333
bool is_initialized() const { return perf_shared_mem_host_ != nullptr; }
334334

335+
/**
336+
* Set profiling level before initialize().
337+
* 0=off, 1=AICore-only, 2=task+fanout, 3=full (with phase buffers)
338+
*/
339+
void set_perf_level(int level) {
    // Record the requested level in perf_level_; the value is stored unchanged.
    perf_level_ = level;
}
340+
335341
/**
336342
* Drain remaining buffers from the memory manager's ready queue
337343
*
@@ -387,6 +393,9 @@ class PerformanceCollector {
387393
PerfRegisterCallback register_cb_{nullptr};
388394
PerfFreeCallback free_cb_{nullptr};
389395

396+
// Profiling level: 0=off, 1=AICore-only, 2=task+fanout, 3=full (with phase)
397+
int perf_level_{0};
398+
390399
// Memory manager
391400
ProfMemoryManager memory_manager_;
392401

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
/*
2+
* Copyright (c) PyPTO Contributors.
3+
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
4+
* CANN Open Software License Agreement Version 2.0 (the "License").
5+
* Please refer to the License for details. You may not use this file except in compliance with the License.
6+
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
7+
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
8+
* See LICENSE in the root of the software repository for the full text of the License.
9+
* -----------------------------------------------------------------------------------------------------------
10+
*/
11+
12+
/**
13+
* Shared helper: set perf_level and legacy enable_profiling on a Runtime struct.
14+
*
15+
* Used by both onboard and sim pto_runtime_c_api.cpp implementations.
16+
* Some runtime structs still carry a bool enable_profiling member alongside
17+
* the newer int perf_level. This template detects the legacy member at
18+
* compile time and keeps both in sync.
19+
*/
20+
21+
#pragma once
22+
23+
#include <type_traits>
#include <utility>
24+
25+
/**
 * Compile-time trait reporting whether T has an accessible member named
 * `enable_profiling`.
 *
 * Primary template: selected when the expression `t.enable_profiling`
 * (for an lvalue `t` of type T) is ill-formed, yielding `false`.
 */
template <typename T, typename = void>
struct HasEnableProfilingMember : std::false_type {};

/**
 * Specialization selected when `t.enable_profiling` is a valid expression
 * for an lvalue `t` of type T, yielding `true`.
 */
template <typename T>
struct HasEnableProfilingMember<T, std::void_t<decltype(std::declval<T &>().enable_profiling)>> : std::true_type {};

/**
 * Store the requested profiling level on a runtime object.
 *
 * Always writes `runtime->perf_level`. When R additionally carries the
 * legacy `enable_profiling` member, that member is set to whether the
 * level is greater than zero, so both fields stay consistent.
 *
 * @tparam R          Runtime type; must have an assignable `perf_level` member.
 * @param runtime     Runtime object to update. It is dereferenced without a
 *                    null check, so the caller must pass a valid pointer.
 * @param perf_level  Requested profiling level (0=off, 1=AICore-only,
 *                    2=task+fanout, 3=full). The value is stored unchanged.
 */
template <typename R>
static inline void set_runtime_profiling_mode(R *runtime, int perf_level) {
    runtime->perf_level = perf_level;
    // Discarded at compile time for runtime types without the legacy member.
    if constexpr (HasEnableProfilingMember<R>::value) {
        runtime->enable_profiling = (perf_level > 0);
    }
}

src/a2a3/platform/onboard/host/device_runner.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,7 @@ int DeviceRunner::run(
461461
});
462462

463463
// Initialize performance profiling if enabled
464-
if (runtime.enable_profiling) {
464+
if (runtime.perf_level > 0) {
465465
rc = init_performance_profiling(runtime, num_aicore, device_id);
466466
if (rc != 0) {
467467
LOG_ERROR("init_performance_profiling failed: %d", rc);
@@ -530,13 +530,13 @@ int DeviceRunner::run(
530530
{
531531
// Poll and collect performance data in a separate collector thread
532532
std::thread collector_thread;
533-
if (runtime.enable_profiling) {
533+
if (runtime.perf_level > 0) {
534534
collector_thread = create_thread([this, &runtime]() {
535535
poll_and_collect_performance_data(runtime.get_task_count());
536536
});
537537
}
538538
auto thread_guard = RAIIScopeGuard([&]() {
539-
if (runtime.enable_profiling && collector_thread.joinable()) {
539+
if (runtime.perf_level > 0 && collector_thread.joinable()) {
540540
collector_thread.join();
541541
}
542542
});
@@ -557,13 +557,13 @@ int DeviceRunner::run(
557557
}
558558

559559
// Signal collector that device execution is complete
560-
if (runtime.enable_profiling) {
560+
if (runtime.perf_level > 0) {
561561
perf_collector_.signal_execution_complete();
562562
}
563563
}
564564

565565
// Stop memory management, drain remaining buffers, collect phase data, export
566-
if (runtime.enable_profiling) {
566+
if (runtime.perf_level > 0) {
567567
perf_collector_.stop_memory_manager();
568568
perf_collector_.drain_remaining_buffers();
569569
perf_collector_.scan_remaining_perf_buffers();
@@ -822,6 +822,7 @@ int DeviceRunner::init_performance_profiling(Runtime &runtime, int num_aicore, i
822822
return rtFree(dev_ptr);
823823
};
824824

825+
perf_collector_.set_perf_level(runtime.perf_level);
825826
return perf_collector_.initialize(runtime, num_aicore, device_id, alloc_cb, register_cb, free_cb);
826827
}
827828

src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "common/unified_log.h"
2727
#include "device_runner.h" // NOLINT(build/include_subdir)
2828
#include "runtime.h" // NOLINT(build/include_subdir)
29+
#include "host/runtime_profiling_mode.h"
2930

3031
extern "C" {
3132

@@ -162,9 +163,7 @@ int run_runtime(
162163
}
163164

164165
// Phase 2: profiling
165-
if (enable_profiling) {
166-
r->enable_profiling = true;
167-
}
166+
set_runtime_profiling_mode(r, enable_profiling);
168167

169168
// Phase 3: launch
170169
std::vector<uint8_t> aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size);

src/a2a3/platform/sim/host/device_runner.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -299,7 +299,7 @@ int DeviceRunner::run(
299299
last_runtime_ = &runtime;
300300

301301
// Initialize performance profiling if enabled
302-
if (runtime.enable_profiling) {
302+
if (runtime.perf_level > 0) {
303303
rc = init_performance_profiling(runtime, num_aicore, device_id);
304304
if (rc != 0) {
305305
LOG_ERROR("init_performance_profiling failed: %d", rc);
@@ -388,7 +388,7 @@ int DeviceRunner::run(
388388

389389
// Poll and collect performance data during execution (if enabled)
390390
std::thread collector_thread;
391-
if (runtime.enable_profiling) {
391+
if (runtime.perf_level > 0) {
392392
collector_thread = create_thread([this, &runtime]() {
393393
poll_and_collect_performance_data(runtime.get_task_count());
394394
});
@@ -404,19 +404,19 @@ int DeviceRunner::run(
404404
}
405405

406406
// Signal collector that device execution is complete
407-
if (runtime.enable_profiling) {
407+
if (runtime.perf_level > 0) {
408408
perf_collector_.signal_execution_complete();
409409
}
410410

411411
// Wait for collector thread if it was launched
412-
if (runtime.enable_profiling && collector_thread.joinable()) {
412+
if (runtime.perf_level > 0 && collector_thread.joinable()) {
413413
collector_thread.join();
414414
}
415415

416416
LOG_INFO("All threads completed");
417417

418418
// Stop memory management, drain remaining buffers, collect phase data, export
419-
if (runtime.enable_profiling) {
419+
if (runtime.perf_level > 0) {
420420
perf_collector_.stop_memory_manager();
421421
perf_collector_.drain_remaining_buffers();
422422
perf_collector_.scan_remaining_perf_buffers();
@@ -623,6 +623,7 @@ void DeviceRunner::remove_kernel_binary(int func_id) {
623623
// =============================================================================
624624

625625
int DeviceRunner::init_performance_profiling(Runtime &runtime, int num_aicore, int device_id) {
626+
perf_collector_.set_perf_level(runtime.perf_level);
626627
// Define allocation callback (a2a3sim: use malloc)
627628
auto alloc_cb = [](size_t size) -> void * {
628629
return malloc(size);

src/a2a3/platform/sim/host/pto_runtime_c_api.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "cpu_sim_context.h" // NOLINT(build/include_subdir)
2929
#include "device_runner.h" // NOLINT(build/include_subdir)
3030
#include "runtime.h" // NOLINT(build/include_subdir)
31+
#include "host/runtime_profiling_mode.h"
3132

3233
extern "C" {
3334

@@ -154,9 +155,7 @@ int run_runtime(
154155
}
155156

156157
// Phase 2: profiling
157-
if (enable_profiling) {
158-
r->enable_profiling = true;
159-
}
158+
set_runtime_profiling_mode(r, enable_profiling);
160159

161160
// Phase 3: launch
162161
std::vector<uint8_t> aicpu_vec;

0 commit comments

Comments
 (0)