Skip to content

Commit 901edea

Browse files
committed
Fix: unify tensor dump control under profiling flags
- add `enable_profiling_flag` to the AICPU/AICore handshake and initialize the dump bit in onboard and sim device runners
- replace `PTO2_DUMP_TENSOR` guards with `PTO2_PROFILING` and remove the old per-runtime dump macro definitions
- add an AICore pipe barrier before completion when dumping tensors to preserve write visibility for dumps
1 parent 6800c38 commit 901edea

33 files changed

Lines changed: 185 additions & 120 deletions

File tree

src/a2a3/platform/include/common/kernel_args.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ struct KernelArgs {
7272
__may_used_by_aicore__ Runtime *runtime_args{nullptr}; // Task runtime in device memory
7373
uint64_t regs{0}; // Per-core register base address array (platform-specific)
7474
uint64_t ffts_base_addr{0}; // FFTS base address for AICore
75-
uint64_t dump_data_base{0}; // Dump shared memory base address, zero when unused
75+
uint64_t dump_data_base{0}; // Dump shared memory base address; use explicit flags to detect enablement
7676
};
7777

7878
#ifdef __cplusplus

src/a2a3/platform/include/common/platform_config.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,13 @@ inline double cycles_to_us(uint64_t cycles) {
142142
return (static_cast<double>(cycles) / PLATFORM_PROF_SYS_CNT_FREQ) * 1000000.0;
143143
}
144144

145+
// Profiling-related runtime flags shared through AICPU-AICore handshake.
146+
#define PROFILING_FLAG_NONE 0u
147+
#define PROFILING_FLAG_DUMP_TENSOR (1u << 0)
148+
#define GET_PROFILING_FLAG(flags, bit) ((((uint32_t)(flags)) & ((uint32_t)(bit))) != 0u)
149+
#define SET_PROFILING_FLAG(flags, bit) ((flags) |= (uint32_t)(bit))
150+
#define CLEAR_PROFILING_FLAG(flags, bit) ((flags) &= ~((uint32_t)(bit)))
151+
145152
// =============================================================================
146153
// Tensor Dump Configuration
147154
// =============================================================================

src/a2a3/platform/include/common/tensor_dump.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
* Fully decoupled from profiling — uses its own ready queues, buffer states,
1818
* and memory manager thread.
1919
*
20-
* Memory layout (Dump SHM, allocated only when PTO2_DUMP_TENSOR=1):
20+
* Memory layout (Dump SHM, allocated only when PTO2_PROFILING=1):
2121
* ┌─────────────────────────────────────────────────────────────┐
2222
* │ DumpDataHeader (fixed header) │
2323
* │ - Per-thread ready queues (circular FIFOs) │

src/a2a3/platform/onboard/aicpu/kernel.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,9 +80,11 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer
8080
}
8181

8282
// Store platform regs before calling aicpu_execute
83+
// Dump enable is an execution control flag propagated via handshake.
84+
// The dump base address is only the backing storage location.
8385
set_platform_regs(k_args->regs);
8486
set_platform_dump_base(k_args->dump_data_base);
85-
set_enable_dump_tensor(k_args->dump_data_base != 0);
87+
set_enable_dump_tensor(GET_PROFILING_FLAG(runtime->workers[0].enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR));
8688

8789
// Affinity gate: drop excess threads before entering runtime
8890
if (!platform_aicpu_affinity_gate(runtime->sche_cpu_num, PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH)) {

src/a2a3/platform/onboard/host/device_runner.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,7 @@ int DeviceRunner::run(
421421

422422
// Calculate number of AIC cores (1/3 of total)
423423
int num_aic = block_dim; // Round up for 1/3
424+
const uint32_t enable_profiling_flag = enable_dump_tensor ? PROFILING_FLAG_DUMP_TENSOR : PROFILING_FLAG_NONE;
424425

425426
for (int i = 0; i < num_aicore; i++) {
426427
runtime.workers[i].aicpu_ready = 0;
@@ -430,6 +431,7 @@ int DeviceRunner::run(
430431
runtime.workers[i].task_status = 0;
431432
// Set core type: first 1/3 are AIC, remaining 2/3 are AIV
432433
runtime.workers[i].core_type = (i < num_aic) ? CoreType::AIC : CoreType::AIV;
434+
runtime.workers[i].enable_profiling_flag = enable_profiling_flag;
433435
runtime.workers[i].perf_records_addr = static_cast<uint64_t>(0);
434436
runtime.workers[i].perf_buffer_status = 0;
435437
}

src/a2a3/platform/sim/aicore/inner_kernel.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,10 @@ typedef int mem_dsb_t;
5757
#define SINGLE_CACHE_LINE 0
5858
#define CACHELINE_OUT 0
5959

60+
// pipe_barrier - memory barrier in simulation (hardware pipeline synchronization)
61+
#define PIPE_ALL 0
62+
#define pipe_barrier(pipe) __sync_synchronize()
63+
6064
// SPIN_WAIT_HINT - CPU pause hint + OS yield for idle polling loops in simulation.
6165
// In simulation, all AICore/AICPU threads share a small number of host CPU cores.
6266
// The CPU hint (pause/yield) reduces pipeline waste, and sched_yield() lets the OS

src/a2a3/platform/sim/host/device_runner.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,7 @@ int DeviceRunner::run(
286286

287287
// Calculate number of AIC cores
288288
int num_aic = block_dim;
289+
const uint32_t enable_profiling_flag = enable_dump_tensor ? PROFILING_FLAG_DUMP_TENSOR : PROFILING_FLAG_NONE;
289290

290291
for (int i = 0; i < num_aicore; i++) {
291292
runtime.workers[i].aicpu_ready = 0;
@@ -295,6 +296,7 @@ int DeviceRunner::run(
295296
runtime.workers[i].task_status = 0;
296297
// First 1/3 are AIC, remaining 2/3 are AIV
297298
runtime.workers[i].core_type = (i < num_aic) ? CoreType::AIC : CoreType::AIV;
299+
runtime.workers[i].enable_profiling_flag = enable_profiling_flag;
298300
}
299301

300302
// Set function_bin_addr for each task: func_id_to_addr_[] stores CoreCallable

src/a2a3/runtime/aicpu_build_graph/aicore/aicore_executor.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in
8686
__gm__ PTO2DispatchPayload *payload = reinterpret_cast<__gm__ PTO2DispatchPayload *>(my_hank->task);
8787

8888
bool profiling_enabled = runtime->enable_profiling;
89+
bool dump_tensor_enabled = GET_PROFILING_FLAG(my_hank->enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR);
8990

9091
// Phase 4: Main execution loop - poll register for tasks until exit signal
9192
// Register encoding: AICPU_IDLE_TASK_ID=idle, task_id=task, AICORE_EXIT_SIGNAL=exit
@@ -120,6 +121,10 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in
120121
// Execute the task
121122
execute_task(payload);
122123

124+
if (dump_tensor_enabled) {
125+
pipe_barrier(PIPE_ALL);
126+
}
127+
123128
// Performance profiling: record task execution
124129
// (func_id and core_type are filled by AICPU at completion time)
125130
if (profiling_enabled) {

src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,7 @@ struct AicpuExecutor {
406406
);
407407
cur_thread_completed++;
408408
if (mixed_complete) {
409-
#if PTO2_DUMP_TENSOR
409+
#if PTO2_PROFILING
410410
if (get_enable_dump_tensor()) {
411411
dump_tensors_for_task<PTO2_SUBTASK_SLOT_COUNT>(
412412
thread_idx, slot_state, TensorDumpStage::AFTER_COMPLETION,
@@ -515,7 +515,7 @@ struct AicpuExecutor {
515515
,
516516
bool profiling_enabled
517517
#endif
518-
#if PTO2_PROFILING || PTO2_DUMP_TENSOR
518+
#if PTO2_PROFILING
519519
,
520520
int32_t thread_idx
521521
#endif
@@ -944,7 +944,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa
944944
perf_aicpu_set_orch_thread_idx(sched_thread_num_);
945945
}
946946
#endif
947-
#if PTO2_DUMP_TENSOR
947+
#if PTO2_PROFILING
948948
if (get_enable_dump_tensor()) {
949949
dump_tensor_init(orch_to_sched_ ? thread_num_ : sched_thread_num_);
950950
}
@@ -1163,7 +1163,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa
11631163
uint64_t t_setup_start = get_sys_cnt_aicpu();
11641164
#endif
11651165
ResourceCount rc = shape_resource_count(shape);
1166-
#if PTO2_DUMP_TENSOR
1166+
#if PTO2_PROFILING
11671167
if (get_enable_dump_tensor()) {
11681168
dump_tensors_for_task<PTO2_SUBTASK_SLOT_COUNT>(
11691169
thread_idx, *slot_state, TensorDumpStage::BEFORE_DISPATCH,
@@ -1183,7 +1183,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa
11831183
,
11841184
profiling_enabled
11851185
#endif
1186-
#if PTO2_PROFILING || PTO2_DUMP_TENSOR
1186+
#if PTO2_PROFILING
11871187
,
11881188
thread_idx
11891189
#endif
@@ -1197,7 +1197,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa
11971197
,
11981198
profiling_enabled
11991199
#endif
1200-
#if PTO2_PROFILING || PTO2_DUMP_TENSOR
1200+
#if PTO2_PROFILING
12011201
,
12021202
thread_idx
12031203
#endif
@@ -1210,7 +1210,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa
12101210
,
12111211
profiling_enabled
12121212
#endif
1213-
#if PTO2_PROFILING || PTO2_DUMP_TENSOR
1213+
#if PTO2_PROFILING
12141214
,
12151215
thread_idx
12161216
#endif
@@ -1272,7 +1272,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa
12721272
#endif
12731273
Cluster &c = tracker.clusters[ci];
12741274
ResourceCount rc = shape_resource_count(shape);
1275-
#if PTO2_DUMP_TENSOR
1275+
#if PTO2_PROFILING
12761276
if (get_enable_dump_tensor()) {
12771277
dump_tensors_for_task<PTO2_SUBTASK_SLOT_COUNT>(
12781278
thread_idx, *slot_state, TensorDumpStage::BEFORE_DISPATCH,
@@ -1292,7 +1292,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa
12921292
,
12931293
profiling_enabled
12941294
#endif
1295-
#if PTO2_PROFILING || PTO2_DUMP_TENSOR
1295+
#if PTO2_PROFILING
12961296
,
12971297
thread_idx
12981298
#endif
@@ -1306,7 +1306,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa
13061306
,
13071307
profiling_enabled
13081308
#endif
1309-
#if PTO2_PROFILING || PTO2_DUMP_TENSOR
1309+
#if PTO2_PROFILING
13101310
,
13111311
thread_idx
13121312
#endif
@@ -1319,7 +1319,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa
13191319
,
13201320
profiling_enabled
13211321
#endif
1322-
#if PTO2_PROFILING || PTO2_DUMP_TENSOR
1322+
#if PTO2_PROFILING
13231323
,
13241324
thread_idx
13251325
#endif
@@ -1650,7 +1650,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa
16501650
perf_aicpu_flush_phase_buffers(thread_idx);
16511651
}
16521652
#endif
1653-
#if PTO2_DUMP_TENSOR
1653+
#if PTO2_PROFILING
16541654
if (get_enable_dump_tensor()) {
16551655
dump_tensor_flush(thread_idx);
16561656
}

src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2_types.h

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -58,14 +58,6 @@
5858
#error "PTO2_SCHED_PROFILING requires PTO2_PROFILING=1"
5959
#endif
6060

61-
// =============================================================================
62-
// Dump Tensor Configuration
63-
// =============================================================================
64-
65-
#ifndef PTO2_DUMP_TENSOR
66-
#define PTO2_DUMP_TENSOR 1
67-
#endif
68-
6961
// =============================================================================
7062
// AICPU Error Codes (written to shared memory for Host-side diagnosis)
7163
// =============================================================================

0 commit comments

Comments
 (0)