diff --git a/src/platform/include/common/perf_profiling.h b/src/platform/include/common/perf_profiling.h index 596f52a16..89744cd4e 100644 --- a/src/platform/include/common/perf_profiling.h +++ b/src/platform/include/common/perf_profiling.h @@ -205,7 +205,7 @@ enum class AicpuPhaseId : uint32_t { SCHED_COMPLETE = 0, // Process completed tasks (fanout traversal) SCHED_DISPATCH = 1, // Dispatch ready tasks to idle cores SCHED_SCAN = 2, // Incremental scan for root tasks - SCHED_EARLY_READY = 3, // Drain orchestrator's early-ready queue + SCHED_IDLE_WAIT = 3, // Idle/spinning (no progress) // Orchestrator phases (16-24) ORCH_SYNC = 16, // tensormap sync ORCH_ALLOC = 17, // task_ring_alloc diff --git a/src/platform/src/host/performance_collector.cpp b/src/platform/src/host/performance_collector.cpp index ea7a86cab..4b49210dd 100644 --- a/src/platform/src/host/performance_collector.cpp +++ b/src/platform/src/host/performance_collector.cpp @@ -511,7 +511,7 @@ int PerformanceCollector::export_swimlane_json(const std::string& output_path) { case AicpuPhaseId::SCHED_COMPLETE: phase_name = "complete"; break; case AicpuPhaseId::SCHED_DISPATCH: phase_name = "dispatch"; break; case AicpuPhaseId::SCHED_SCAN: phase_name = "scan"; break; - case AicpuPhaseId::SCHED_EARLY_READY: phase_name = "early_ready"; break; + case AicpuPhaseId::SCHED_IDLE_WAIT: phase_name = "idle"; break; default: break; } diff --git a/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 266c44500..063260e73 100644 --- a/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -38,11 +38,11 @@ #include "aicpu/platform_regs.h" // Scheduler profiling helper -#ifndef PTO2_ORCH_PROFILING -#define PTO2_ORCH_PROFILING 1 +#ifndef PTO2_PROFILING +#define PTO2_PROFILING 1 #endif -#if PTO2_ORCH_PROFILING +#if PTO2_PROFILING // Accumulated nanoseconds per sub-step 
#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1 #define CYCLE_COUNT_LAP(acc) do { _t1 = get_sys_cnt_aicpu(); acc += (_t1 - _t0); _t0 = _t1; } while(0) @@ -473,21 +473,31 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx, int32_t last_reported_task_count = 0; // Scheduler profiling counters -#if PTO2_ORCH_PROFILING +#if PTO2_PROFILING uint64_t sched_scan_cycle = 0; - uint64_t sched_orch_drain_cycle = 0; uint64_t sched_complete_cycle = 0; uint64_t sched_dispatch_cycle = 0; - uint64_t sched_yield_cycle = 0; + uint64_t sched_idle_cycle = 0; uint64_t sched_loop_count = 0; - uint64_t sched_yield_count = 0; + uint64_t notify_edges_total = 0; + int32_t notify_max_degree = 0; + uint64_t notify_tasks_enqueued = 0; + uint64_t fanin_edges_total = 0; + int32_t fanin_max_degree = 0; + uint64_t pop_hit = 0; + uint64_t pop_miss = 0; + uint32_t phase_complete_count = 0; + uint32_t phase_dispatch_count = 0; #endif while (true) { -#if PTO2_ORCH_PROFILING +#if PTO2_PROFILING sched_loop_count++; #endif CYCLE_COUNT_START(); +#if PTO2_PROFILING + uint64_t _t0_phase = _t0; +#endif // Dynamic task_count (Thread 3 sets total_tasks_ when orchestration completes) int32_t task_count = total_tasks_.load(std::memory_order_acquire); bool orch_done = orchestrator_done_.load(std::memory_order_acquire); @@ -531,7 +541,15 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx, PTO2DispatchPayload* payload = &s_pto2_payload_per_core[core_id]; int32_t task_id = executing_task_ids_[core_id]; - rt->scheduler.on_task_complete(task_id); + PTO2CompletionStats cstats = rt->scheduler.on_task_complete(task_id); +#if PTO2_PROFILING + notify_edges_total += cstats.fanout_edges; + if (cstats.fanout_edges > notify_max_degree) notify_max_degree = cstats.fanout_edges; + notify_tasks_enqueued += cstats.tasks_enqueued; + fanin_edges_total += cstats.fanin_edges; + if (cstats.fanin_edges > fanin_max_degree) fanin_max_degree = cstats.fanin_edges; 
+ phase_complete_count++; +#endif executing_task_ids_[core_id] = AICPU_TASK_INVALID; // Write AICPU dispatch/finish timestamps into the PerfRecord @@ -568,6 +586,14 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx, } } CYCLE_COUNT_LAP(sched_complete_cycle); +#if PTO2_PROFILING + if (profiling_enabled && phase_complete_count > 0) { + perf_aicpu_record_phase(thread_idx, AicpuPhaseId::SCHED_COMPLETE, + _t0_phase, _t1, sched_loop_count, phase_complete_count); + _t0_phase = _t1; + phase_complete_count = 0; + } +#endif // Phase 2: Dispatch ready tasks to idle cores (register-based dispatch) if (cur_thread_tasks_in_flight < core_num) { @@ -581,6 +607,10 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx, PTO2WorkerType wt = (h->core_type == CoreType::AIC) ? PTO2_WORKER_CUBE : PTO2_WORKER_VECTOR; int32_t task_id = rt->scheduler.get_ready_task(wt); if (task_id >= 0) { +#if PTO2_PROFILING + pop_hit++; + phase_dispatch_count++; +#endif PTO2TaskDescriptor* task = &task_descriptors[task_id & window_mask]; PTO2DispatchPayload* payload = &s_pto2_payload_per_core[core_id]; build_pto2_payload(payload, runtime, task, task_descriptors, dep_list_pool, window_size); @@ -598,11 +628,23 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx, cur_thread_tasks_in_flight++; made_progress = true; DEV_DEBUG("Thread %d: Dispatching PTO2 task %d to core %d", thread_idx, task_id, core_id); + } else { +#if PTO2_PROFILING + pop_miss++; +#endif } } } } CYCLE_COUNT_LAP(sched_dispatch_cycle); +#if PTO2_PROFILING + if (profiling_enabled && phase_dispatch_count > 0) { + perf_aicpu_record_phase(thread_idx, AicpuPhaseId::SCHED_DISPATCH, + _t0_phase, _t1, sched_loop_count, phase_dispatch_count); + _t0_phase = _t1; + phase_dispatch_count = 0; + } +#endif // Update perf header total_tasks if visible tasks have changed { @@ -617,6 +659,13 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx, } 
} CYCLE_COUNT_LAP(sched_scan_cycle); +#if PTO2_PROFILING + if (profiling_enabled) { + perf_aicpu_record_phase(thread_idx, AicpuPhaseId::SCHED_SCAN, + _t0_phase, _t1, sched_loop_count, 0); + _t0_phase = _t1; + } +#endif if (!made_progress) { idle_iterations++; @@ -680,44 +729,64 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx, SPIN_WAIT_HINT(); } PTO2_SPIN_PAUSE_LIGHT(); -#if PTO2_ORCH_PROFILING - sched_yield_count++; + CYCLE_COUNT_LAP(sched_idle_cycle); +#if PTO2_PROFILING + if (profiling_enabled) { + perf_aicpu_record_phase(thread_idx, AicpuPhaseId::SCHED_IDLE_WAIT, + _t0_phase, _t1, sched_loop_count, 0); + _t0_phase = _t1; + } #endif - CYCLE_COUNT_LAP(sched_yield_cycle); } else { idle_iterations = 0; } } -#if PTO2_ORCH_PROFILING +#if PTO2_PROFILING + if (profiling_enabled) { uint64_t sched_total = - sched_scan_cycle + sched_orch_drain_cycle + sched_complete_cycle + sched_dispatch_cycle + sched_yield_cycle; + sched_complete_cycle + sched_scan_cycle + sched_dispatch_cycle + sched_idle_cycle; if (sched_total == 0) sched_total = 1; // avoid div-by-zero - DEV_ALWAYS("Thread %d: PTO2 scheduler stats: loops=%llu, completed=%d, total=%.3fus", + double tasks_per_loop = sched_loop_count > 0 ? (double)cur_thread_completed / sched_loop_count : 0.0; + DEV_ALWAYS("Thread %d: completed=%d tasks in %.3fus (%llu loops, %.1f tasks/loop)", thread_idx, - (unsigned long long)sched_loop_count, cur_thread_completed, - cycles_to_us(sched_total)); - DEV_ALWAYS( - "Thread %d: scan=%.3fus (%.1f%%), orch_drain=%.3fus (%.1f%%), complete=%.3fus (%.1f%%), dispatch=%.3fus " - "(%.1f%%)", + cycles_to_us(sched_total), + (unsigned long long)sched_loop_count, + tasks_per_loop); + DEV_ALWAYS("Thread %d: --- Phase Breakdown ---", thread_idx); + double notify_avg = cur_thread_completed > 0 + ? (double)notify_edges_total / cur_thread_completed : 0.0; + double fanin_avg = cur_thread_completed > 0 + ? 
(double)fanin_edges_total / cur_thread_completed : 0.0; + DEV_ALWAYS("Thread %d: complete: %.3fus (%.1f%%) [fanout: edges=%llu, max_degree=%d, avg=%.1f] [fanin: edges=%llu, max_degree=%d, avg=%.1f]", thread_idx, - cycles_to_us(sched_scan_cycle), - sched_scan_cycle * 100.0 / sched_total, - cycles_to_us(sched_orch_drain_cycle), - sched_orch_drain_cycle * 100.0 / sched_total, cycles_to_us(sched_complete_cycle), sched_complete_cycle * 100.0 / sched_total, + (unsigned long long)notify_edges_total, + notify_max_degree, + notify_avg, + (unsigned long long)fanin_edges_total, + fanin_max_degree, + fanin_avg); + DEV_ALWAYS("Thread %d: scan: %.3fus (%.1f%%)", + thread_idx, + cycles_to_us(sched_scan_cycle), + sched_scan_cycle * 100.0 / sched_total); + uint64_t pop_total = pop_hit + pop_miss; + double pop_hit_rate = pop_total > 0 ? pop_hit * 100.0 / pop_total : 0.0; + DEV_ALWAYS("Thread %d: dispatch: %.3fus (%.1f%%) [pop: hit=%llu, miss=%llu, hit_rate=%.1f%%]", + thread_idx, cycles_to_us(sched_dispatch_cycle), - sched_dispatch_cycle * 100.0 / sched_total); - DEV_ALWAYS("Thread %d: yield=%.3fus (%.1f%%, %llu calls, avg=%.1fus)", + sched_dispatch_cycle * 100.0 / sched_total, + (unsigned long long)pop_hit, + (unsigned long long)pop_miss, + pop_hit_rate); + DEV_ALWAYS("Thread %d: idle: %.3fus (%.1f%%)", thread_idx, - cycles_to_us(sched_yield_cycle), - sched_yield_cycle * 100.0 / sched_total, - (unsigned long long)sched_yield_count, - sched_yield_count > 0 ? 
cycles_to_us(sched_yield_cycle) / sched_yield_count : 0.0); - - DEV_ALWAYS("Thread %d: PTO2 execution complete, completed %d tasks", thread_idx, cur_thread_completed); + cycles_to_us(sched_idle_cycle), + sched_idle_cycle * 100.0 / sched_total); + } #endif // Flush performance buffers for cores managed by this thread @@ -911,54 +980,56 @@ int AicpuExecutor::run(Runtime* runtime) { cycles_to_us(orch_cycle_end - orch_cycle_start)); // Print orchestrator profiling data -#if PTO2_ORCH_PROFILING - { +#if PTO2_PROFILING + if (runtime->enable_profiling) { PTO2OrchProfilingData p = pto2_orchestrator_get_profiling(); uint64_t total = p.sync_cycle + p.alloc_cycle + p.params_cycle + p.lookup_cycle + p.heap_cycle + p.insert_cycle + p.fanin_cycle + p.finalize_cycle; - DEV_ALWAYS("=== Orchestrator Profiling: %lld tasks, total=%.3fus ===", + if (total == 0) total = 1; // avoid div-by-zero + DEV_ALWAYS("Thread 3: === Orchestrator Profiling: %lld tasks, total=%.3fus ===", (long long)p.submit_count, cycles_to_us(total)); - DEV_ALWAYS(" sync_tensormap : %.3fus (%.1f%%)", cycles_to_us(p.sync_cycle), p.sync_cycle * 100.0 / total); - DEV_ALWAYS(" task_ring_alloc: %.3fus (%.1f%%)", cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total); - DEV_ALWAYS(" param_copy : %.3fus (%.1f%%)", cycles_to_us(p.params_cycle), p.params_cycle * 100.0 / total); - DEV_ALWAYS(" lookup+dep : %.3fus (%.1f%%)", cycles_to_us(p.lookup_cycle), p.lookup_cycle * 100.0 / total); - DEV_ALWAYS(" heap_alloc : %.3fus (%.1f%%)", cycles_to_us(p.heap_cycle), p.heap_cycle * 100.0 / total); - DEV_ALWAYS(" tensormap_ins : %.3fus (%.1f%%)", cycles_to_us(p.insert_cycle), p.insert_cycle * 100.0 / total); - DEV_ALWAYS(" fanin+ready : %.3fus (%.1f%%)", cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total); - DEV_ALWAYS(" finalize+SM : %.3fus (%.1f%%)", cycles_to_us(p.finalize_cycle), p.finalize_cycle * 100.0 / total); - DEV_ALWAYS(" scope_end : %.3fus", cycles_to_us(p.scope_end_cycle)); - DEV_ALWAYS(" avg/task : %.3fus", 
cycles_to_us(total) / p.submit_count); - + DEV_ALWAYS("Thread 3: sync_tensormap : %.3fus (%.1f%%)", cycles_to_us(p.sync_cycle), p.sync_cycle * 100.0 / total); + DEV_ALWAYS("Thread 3: task_ring_alloc: %.3fus (%.1f%%)", cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total); + DEV_ALWAYS("Thread 3: param_copy : %.3fus (%.1f%%)", cycles_to_us(p.params_cycle), p.params_cycle * 100.0 / total); + DEV_ALWAYS("Thread 3: lookup+dep : %.3fus (%.1f%%)", cycles_to_us(p.lookup_cycle), p.lookup_cycle * 100.0 / total); + DEV_ALWAYS("Thread 3: heap_alloc : %.3fus (%.1f%%)", cycles_to_us(p.heap_cycle), p.heap_cycle * 100.0 / total); + DEV_ALWAYS("Thread 3: tensormap_ins : %.3fus (%.1f%%)", cycles_to_us(p.insert_cycle), p.insert_cycle * 100.0 / total); + DEV_ALWAYS("Thread 3: fanin+ready : %.3fus (%.1f%%)", cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total); + DEV_ALWAYS("Thread 3: finalize+SM : %.3fus (%.1f%%)", cycles_to_us(p.finalize_cycle), p.finalize_cycle * 100.0 / total); + DEV_ALWAYS("Thread 3: scope_end : %.3fus", cycles_to_us(p.scope_end_cycle)); + DEV_ALWAYS("Thread 3: avg/task : %.3fus", + p.submit_count > 0 ? cycles_to_us(total) / p.submit_count : 0.0); + +#if PTO2_TENSORMAP_PROFILING PTO2TensorMapProfilingData tp = pto2_tensormap_get_profiling(); - DEV_ALWAYS("=== TensorMap Lookup Stats ==="); - DEV_ALWAYS(" lookups : %llu, inserts: %llu", + DEV_ALWAYS("Thread 3: === TensorMap Lookup Stats ==="); + DEV_ALWAYS("Thread 3: lookups : %llu, inserts: %llu", (unsigned long long)tp.lookup_count, (unsigned long long)tp.insert_count); - DEV_ALWAYS(" chain walked : total=%llu, avg=%.1f, max=%d", + DEV_ALWAYS("Thread 3: chain walked : total=%llu, avg=%.1f, max=%d", (unsigned long long)tp.lookup_chain_total, tp.lookup_count > 0 ? 
(double)tp.lookup_chain_total / tp.lookup_count : 0.0, tp.lookup_chain_max); - DEV_ALWAYS(" overlap checks : %llu, hits=%llu (%.1f%%)", + DEV_ALWAYS("Thread 3: overlap checks : %llu, hits=%llu (%.1f%%)", (unsigned long long)tp.overlap_checks, (unsigned long long)tp.overlap_hits, tp.overlap_checks > 0 ? tp.overlap_hits * 100.0 / tp.overlap_checks : 0.0); +#endif // Write orchestrator summary to shared memory for host-side export - if (runtime->enable_profiling) { - AicpuOrchSummary orch_summary = {}; - orch_summary.start_time = orch_cycle_start; - orch_summary.end_time = orch_cycle_end; - orch_summary.sync_cycle = p.sync_cycle; - orch_summary.alloc_cycle = p.alloc_cycle; - orch_summary.params_cycle = p.params_cycle; - orch_summary.lookup_cycle = p.lookup_cycle; - orch_summary.heap_cycle = p.heap_cycle; - orch_summary.insert_cycle = p.insert_cycle; - orch_summary.fanin_cycle = p.fanin_cycle; - orch_summary.finalize_cycle = p.finalize_cycle; - orch_summary.scope_end_cycle = p.scope_end_cycle; - orch_summary.submit_count = p.submit_count; - perf_aicpu_write_orch_summary(&orch_summary); - } + AicpuOrchSummary orch_summary = {}; + orch_summary.start_time = orch_cycle_start; + orch_summary.end_time = orch_cycle_end; + orch_summary.sync_cycle = p.sync_cycle; + orch_summary.alloc_cycle = p.alloc_cycle; + orch_summary.params_cycle = p.params_cycle; + orch_summary.lookup_cycle = p.lookup_cycle; + orch_summary.heap_cycle = p.heap_cycle; + orch_summary.insert_cycle = p.insert_cycle; + orch_summary.fanin_cycle = p.fanin_cycle; + orch_summary.finalize_cycle = p.finalize_cycle; + orch_summary.scope_end_cycle = p.scope_end_cycle; + orch_summary.submit_count = p.submit_count; + perf_aicpu_write_orch_summary(&orch_summary); } #endif @@ -975,7 +1046,7 @@ int AicpuExecutor::run(Runtime* runtime) { PTO2SharedMemoryHeader* sm_header = static_cast(sm); int32_t pto2_task_count = sm_header ? 
sm_header->current_task_index.load(std::memory_order_acquire) : 0; - DEV_ALWAYS("PTO2 total submitted tasks = %d", pto2_task_count); + DEV_ALWAYS("Thread 3: PTO2 total submitted tasks = %d", pto2_task_count); total_tasks_.store(pto2_task_count, std::memory_order_release); orchestrator_done_.store(true, std::memory_order_release); DEV_INFO("Thread 3: Set orchestrator_done=true, waiting for scheduler threads"); diff --git a/src/runtime/tensormap_and_ringbuffer/doc/device_log_profiling.md b/src/runtime/tensormap_and_ringbuffer/doc/device_log_profiling.md index f18402f5b..4d8d8dc11 100644 --- a/src/runtime/tensormap_and_ringbuffer/doc/device_log_profiling.md +++ b/src/runtime/tensormap_and_ringbuffer/doc/device_log_profiling.md @@ -37,18 +37,18 @@ Thread 3 loads the orchestration `.so` via `dlopen`, calls `aicpu_orchestration_ Thread 3: Calling aicpu_orchestration_entry from SO aicpu_orchestration_entry ">>>>>> batch = 64" Thread 3: aicpu_orchestration_entry returned, cost 20943.940us -=== Orchestrator Profiling: 16704 tasks, total=14601.580us === - sync_tensormap : 286.300us (2.0%) - task_ring_alloc: 380.400us (2.6%) - param_copy : 2147.800us (14.7%) - lookup+dep : 7290.300us (49.9%) - heap_alloc : 701.500us (4.8%) - tensormap_ins : 1890.380us (12.9%) - fanin+ready : 1207.400us (8.3%) - finalize+SM : 697.500us (4.8%) - scope_end : 364.080us - avg/task : 0.874us -PTO2 total submitted tasks = 16704 +Thread 3: === Orchestrator Profiling: 16704 tasks, total=14601.580us === +Thread 3: sync_tensormap : 286.300us (2.0%) +Thread 3: task_ring_alloc: 380.400us (2.6%) +Thread 3: param_copy : 2147.800us (14.7%) +Thread 3: lookup+dep : 7290.300us (49.9%) +Thread 3: heap_alloc : 701.500us (4.8%) +Thread 3: tensormap_ins : 1890.380us (12.9%) +Thread 3: fanin+ready : 1207.400us (8.3%) +Thread 3: finalize+SM : 697.500us (4.8%) +Thread 3: scope_end : 364.080us +Thread 3: avg/task : 0.874us +Thread 3: PTO2 total submitted tasks = 16704 ``` ### Field Reference @@ -79,26 +79,17 @@ PTO2 
total submitted tasks = 16704 ## Block 2: PTO2 Scheduler Summary -Each of the 3 scheduler threads (Thread 0, 1, 2) prints its own summary after completing all tasks. The output has three sub-sections: **summary**, **phase breakdown**, and **lock contention**. +Each of the 3 scheduler threads (Thread 0, 1, 2) prints its own summary after completing all tasks. The output has two sub-sections: **summary** and **phase breakdown**. -### Example (Thread 0, from the same run) +### Example (Thread 0, from a different run: batch=1, 1044 tasks) ``` -Thread 0: === PTO2 Scheduler Summary === -Thread 0: completed=6068 tasks in 31398us (977 loops, 6.2 tasks/loop) -Thread 0: --- Phase Breakdown (execution order) --- -Thread 0: scan: 2295us ( 7.3%) -Thread 0: early_ready: 77us ( 0.2%) (deps already met at submit time) -Thread 0: complete: 11374us (36.2%) [fanout: edges=7578, max_degree=20, avg=1.2] -Thread 0: dispatch: 17651us (56.2%) [steal: own=4443, steal=1625, pct=26.8%] -Thread 0: --- Lock Contention (ready_q) --- -Thread 0: total: wait= 8366us hold= 4144us -Thread 0: scan: wait= 318us hold= 704us -Thread 0: early_ready: wait= 0us hold= 0us -Thread 0: complete: wait= 1374us hold= 781us -Thread 0: dispatch: wait= 6674us hold= 2659us -Thread 0: hit: wait= 1361us hold= 551us (dequeued task) -Thread 0: miss: wait= 5313us hold= 2108us (empty queue) +Thread 0: completed=352 tasks in 3477.420us (147 loops, 2.4 tasks/loop) +Thread 0: --- Phase Breakdown --- +Thread 0: complete: 1485.020us (42.7%) [fanout: edges=432, max_degree=2, avg=1.2] [fanin: edges=320, max_degree=3, avg=0.9] +Thread 0: scan: 14.400us (0.4%) +Thread 0: dispatch: 1973.060us (56.7%) [pop: hit=352, miss=3043, hit_rate=10.4%] +Thread 0: idle: 4.940us (0.1%) ``` ### Summary Line @@ -116,52 +107,26 @@ Thread N: completed=X tasks in Yus (Z loops, W tasks/loop) ### Phase Breakdown -The scheduler loop runs four phases in order each iteration. Each phase's time is accumulated across all loop iterations. 
+The scheduler loop runs four phases each iteration. Each phase's time is accumulated across all loop iterations. | Phase | What it does | Inline stats | |-------|-------------|-------------| -| **scan** | Scans newly submitted tasks in shared memory; enqueues root tasks (those with `fanin_count == 0`) into the ready queue | — | -| **early_ready** | Drains the orchestrator's ready queue (tasks whose dependencies were all satisfied at submit time, detected via Step 5b in `pto2_submit_task`) | — | -| **complete** | Polls register `COND` on each managed core; when a core becomes `IDLE`, traverses the completed task's fanout list, increments consumer refcounts, and enqueues newly ready consumers | `edges`: total fanout edges traversed; `max_degree`: largest fanout list; `avg`: average fanout per completed task | -| **dispatch** | For each idle core, pops a task from the ready queue (own shard first, then work-stealing from other shards), builds the dispatch payload, and writes the task to the core's register | `own`: tasks dequeued from own shard; `steal`: tasks stolen from other shards; `pct`: steal percentage | +| **complete** | Polls handshake on each managed core; when a core completes, traverses fanout list (notify consumers) and fanin list (release producers) via `on_task_complete` | `fanout`: edges/max_degree/avg for consumer notification; `fanin`: edges/max_degree/avg for producer release | +| **scan** | Updates the perf profiling header with latest scheduler state | — | +| **dispatch** | For each idle core, pops a task from the ready queue via `pto2_scheduler_get_ready_task`, builds the dispatch payload, and writes the task to the core's handshake register | `pop`: `hit` = successful pops (task dispatched), `miss` = empty queue pops, `hit_rate` = hit/(hit+miss) | +| **idle** | Scheduler loop iteration where no progress was made (no completions, no dispatches) | — | **Interpreting phase percentages:** -- **dispatch** is typically the largest (~55%) because it 
includes both ready-queue pops (with lock contention) and the actual register writes + payload construction. -- **complete** is the second largest (~36%) because fanout traversal involves atomic operations (`SEQ_CST` fetch_add on refcounts) and conditional ready-queue pushes. -- **scan** is small (~7%) — it only runs until all submitted tasks have been scanned. -- **early_ready** is negligible in most cases. +- **dispatch** is typically the largest (~55-60%) because it includes ready-queue pops (with spinlock), payload construction, and cache flush (`dc cvac` + `dsb sy`). +- **complete** is the second largest (~40-45%) because it traverses both fanout (CAS-based fanin decrement, conditional ready-queue push) and fanin (release_producer, check_consumed, ring pointer advancement). +- **scan** is small (<1%) — only updates the perf header. +- **idle** is negligible when tasks are flowing; high idle% indicates the scheduler is starved. -### Lock Contention (ready_q) +**Interpreting pop hit_rate:** -Ready queues are sharded (one shard per scheduler thread). Access is protected by per-shard spinlocks. This section reports cumulative lock **wait** (time spinning to acquire) and **hold** (time from acquire to release) for each phase. 
- -``` -Thread N: total: wait=Xus hold=Yus # sum across all phases -Thread N: scan: wait=Xus hold=Yus # lock during root-task enqueue -Thread N: early_ready: wait=Xus hold=Yus # lock during orch-ready drain -Thread N: complete: wait=Xus hold=Yus # lock during fanout push -Thread N: dispatch: wait=Xus hold=Yus # lock during ready-queue pop (sum of hit+miss) -Thread N: hit: wait=Xus hold=Yus # pop attempts that dequeued a task -Thread N: miss: wait=Xus hold=Yus # pop attempts on empty queue -``` - -**Key observations from the example:** - -| Metric | Thread 0 | Thread 1 | Thread 2 | -|--------|----------|----------|----------| -| completed | 6068 | 5997 | 4639 | -| total time (us) | 31398 | 31410 | 31406 | -| dispatch % | 56.2% | 56.0% | 60.5% | -| complete % | 36.2% | 36.7% | 35.4% | -| steal % | 26.8% | 25.8% | 41.4% | -| lock wait (us) | 8366 | 9176 | 8807 | -| lock hold (us) | 4144 | 4688 | 3971 | - -- **Lock contention is moderate**: total wait ~8-9ms out of ~31ms total time (~27-30%). -- **dispatch.miss dominates wait time**: most lock wait comes from polling empty queues, not actual contention. Thread 0's dispatch miss wait = 5313us vs hit wait = 1361us. -- **Work stealing is active**: Thread 2 steals 41.4% of its tasks, indicating it finishes its own shard's tasks faster and helps drain other shards. -- **Threads are well-balanced**: all three complete within ~31ms, despite different task counts (Thread 2 has fewer tasks but higher steal rate). +- **High hit_rate (>50%)**: Ready queue is well-supplied; dispatch is efficient. +- **Low hit_rate (<10%)**: Ready queue is mostly empty when cores become idle. The bottleneck is upstream (orchestrator submission speed or fanout resolution latency), not dispatch itself. 
### Per-Task Averages @@ -169,10 +134,9 @@ Divide each thread's phase times by its `completed` count to get per-task schedu | Metric | Formula | Typical value | |--------|---------|---------------| -| Scheduling overhead per task | total_time / completed | ~5-7 us/task | -| Lock overhead per task | (total_wait + total_hold) / completed | ~1.5-2.5 us/task | -| Dispatch per task | dispatch_time / completed | ~3-4 us/task | -| Complete per task | complete_time / completed | ~2-3 us/task | +| Scheduling overhead per task | total_time / completed | ~5-10 us/task | +| Dispatch per task | dispatch_time / completed | ~3-6 us/task | +| Complete per task | complete_time / completed | ~2-4 us/task | --- @@ -186,7 +150,7 @@ When `--enable-profiling` is used, the host terminal prints a **Task Statistics | Avg scheduling overhead | `sum(thread_total) / total_tasks` (device log) | Time AICPU spends scheduling each task | | Sched/Exec ratio | scheduling / execution | Scheduling overhead relative to kernel execution | -A high sched/exec ratio (e.g., >3x) indicates that scheduling overhead dominates, and optimizations should target the scheduler's non-lock paths (dispatch polling, fanout traversal) before reducing lock contention. +A high sched/exec ratio (e.g., >3x) indicates that scheduling overhead dominates, and optimizations should target the scheduler's dispatch hot path (cache flush, payload construction) or upstream task flow. 
--- @@ -196,12 +160,9 @@ A high sched/exec ratio (e.g., >3x) indicates that scheduling overhead dominates # Find the latest device log for device 2 ls -t $HOME/ascend/log/debug/device-2/device-*.log | head -1 -# Extract orchestrator profiling -grep -E "Orchestrator Profiling|sync_tensormap|task_ring_alloc|param_copy|lookup\+dep|heap_alloc|tensormap_ins|fanin\+ready|finalize\+SM|scope_end|avg/task" - -# Extract scheduler summary -grep -E "Scheduler Summary|completed=|Phase Breakdown|scan:|early_ready:|complete:|dispatch:" +# Extract orchestrator profiling (Thread 3) +grep "Thread 3:" -# Extract lock contention -grep -E "Lock Contention|total:|scan:|early_ready:|complete:|dispatch:|hit:|miss:" +# Extract scheduler profiling (Threads 0/1/2) +grep -E "Thread [012]:" ``` diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index 294e7c709..2e8964eb3 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -23,7 +23,7 @@ // ============================================================================= // Orchestrator Profiling (compile-time toggle) // ============================================================================= -#if PTO2_ORCH_PROFILING +#if PTO2_PROFILING #include "aicpu/device_time.h" #include "aicpu/performance_collector_aicpu.h" // Weak fallback for builds that don't link device_time.cpp (e.g. host). 
@@ -177,7 +177,7 @@ void pto2_scope_begin(PTO2OrchestratorState* orch) { void pto2_scope_end(PTO2OrchestratorState* orch) { assert(orch->scope_stack_top >= 0 && "Scope stack underflow"); -#if PTO2_ORCH_PROFILING +#if PTO2_PROFILING uint64_t _se0 = get_sys_cnt_aicpu(); #endif @@ -191,7 +191,7 @@ void pto2_scope_end(PTO2OrchestratorState* orch) { // Rewind the task buffer — these entries are no longer needed orch->scope_tasks_size = begin; -#if PTO2_ORCH_PROFILING +#if PTO2_PROFILING uint64_t _se1 = get_sys_cnt_aicpu(); g_orch_scope_end_cycle += (_se1 - _se0); perf_aicpu_record_orch_phase(AicpuPhaseId::ORCH_SCOPE_END, _se0, _se1, g_orch_submit_idx); @@ -401,7 +401,7 @@ void pto2_submit_task( CYCLE_COUNT_LAP_RECORD(g_orch_finalize_cycle, AicpuPhaseId::ORCH_FINALIZE); orch->tasks_submitted++; -#if PTO2_ORCH_PROFILING +#if PTO2_PROFILING g_orch_submit_count++; g_orch_submit_idx++; #endif @@ -460,7 +460,7 @@ void pto2_orchestrator_print_scope_stack(PTO2OrchestratorState* orch) { LOG_INFO("=================="); } -#if PTO2_ORCH_PROFILING +#if PTO2_PROFILING PTO2OrchProfilingData pto2_orchestrator_get_profiling() { PTO2OrchProfilingData d; d.sync_cycle = g_orch_sync_cycle; diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index e4ce51a40..c63e12a98 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -229,11 +229,11 @@ void pto2_orchestrator_print_scope_stack(PTO2OrchestratorState* orch); // Orchestrator Profiling Data // ============================================================================= -#ifndef PTO2_ORCH_PROFILING -#define PTO2_ORCH_PROFILING 1 +#ifndef PTO2_PROFILING +#define PTO2_PROFILING 1 #endif -#if PTO2_ORCH_PROFILING +#if PTO2_PROFILING struct PTO2OrchProfilingData { uint64_t sync_cycle; uint64_t alloc_cycle; diff --git 
a/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h b/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h index 0e4b7bdd7..b88c21f65 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h @@ -18,6 +18,12 @@ #ifndef PTO_SCHEDULER_H #define PTO_SCHEDULER_H +// PTO2_PROFILING must be defined before inline methods that use it. +// Default ON; disable with -DPTO2_PROFILING=0. +#ifndef PTO2_PROFILING +#define PTO2_PROFILING 1 +#endif + #include <atomic> #include "pto_runtime2_types.h" @@ -126,6 +132,15 @@ void pto2_ready_queue_reset(PTO2ReadyQueue* queue); // ============================================================================= // Scheduler State // ============================================================================= +/** + * Statistics returned by on_task_complete + */ +struct PTO2CompletionStats { + int32_t fanout_edges; // Number of fanout edges traversed (notify consumers) + int32_t tasks_enqueued; // Number of consumers that became READY + int32_t fanin_edges; // Number of fanin edges traversed (release producers) +}; + /** * Scheduler state structure * @@ -235,7 +250,7 @@ struct PTO2SchedulerState { check_and_handle_consumed(producer_id, producer); } - void release_fanin_and_check_ready(int32_t task_id, + bool release_fanin_and_check_ready(int32_t task_id, PTO2TaskDescriptor* task) { int32_t slot = pto2_task_slot(task_id); @@ -249,8 +264,10 @@ struct PTO2SchedulerState { if (task_state[slot].compare_exchange_strong( expected, PTO2_TASK_READY, std::memory_order_acq_rel, std::memory_order_acquire)) { ready_queues[task->worker_type].push(task_id); + return true; } } + return false; } void init_task(int32_t task_id, PTO2TaskDescriptor* task) { @@ -289,10 +306,10 @@ struct PTO2SchedulerState { } } - int32_t on_task_complete(int32_t task_id) { + PTO2CompletionStats on_task_complete(int32_t task_id) { + PTO2CompletionStats stats = {0, 0, 0}; int32_t slot = pto2_task_slot(task_id); PTO2TaskDescriptor* task = pto2_sm_get_task(sm_handle,
task_id); - int32_t fanout_notified = 0; tasks_completed.fetch_add(1, std::memory_order_relaxed); pto2_fanout_lock(task); @@ -301,10 +318,16 @@ struct PTO2SchedulerState { pto2_fanout_unlock(task); while (current != nullptr) { - fanout_notified++; int32_t consumer_id = current->task_id; PTO2TaskDescriptor* consumer = pto2_sm_get_task(sm_handle, consumer_id); +#if PTO2_PROFILING + stats.fanout_edges++; + if (release_fanin_and_check_ready(consumer_id, consumer)) { + stats.tasks_enqueued++; + } +#else release_fanin_and_check_ready(consumer_id, consumer); +#endif current = current->next; } @@ -312,11 +335,14 @@ struct PTO2SchedulerState { while (current != nullptr) { int32_t producer_id = current->task_id; release_producer(producer_id); +#if PTO2_PROFILING + stats.fanin_edges++; +#endif current = current->next; } check_and_handle_consumed(task_id, task); - return fanout_notified; + return stats; } }; diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp index c69aa73fc..5e22297cc 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp @@ -25,13 +25,13 @@ // ============================================================================= // TensorMap Lookup Chain Length Statistics (compile-time toggle) // ============================================================================= -#if PTO2_ORCH_PROFILING -static uint64_t g_lookup_chain_total = 0; -static uint64_t g_lookup_count = 0; -static int32_t g_lookup_chain_max = 0; -static uint64_t g_lookup_overlap_checks = 0; -static uint64_t g_lookup_overlap_hits = 0; -static uint64_t g_insert_count = 0; +#if PTO2_TENSORMAP_PROFILING +uint64_t g_lookup_chain_total = 0; +uint64_t g_lookup_count = 0; +int32_t g_lookup_chain_max = 0; +uint64_t g_lookup_overlap_checks = 0; +uint64_t g_lookup_overlap_hits = 0; +uint64_t g_insert_count = 0; #endif // 
============================================================================= @@ -253,7 +253,7 @@ void PTO2TensorMap::sync_tensormap() { // ============================================================================= // TensorMap Lookup Profiling // ============================================================================= -#if PTO2_ORCH_PROFILING +#if PTO2_TENSORMAP_PROFILING PTO2TensorMapProfilingData pto2_tensormap_get_profiling() { PTO2TensorMapProfilingData d; d.lookup_chain_total = g_lookup_chain_total; diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h index 7b22c6e62..7e4d7ce06 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h @@ -38,6 +38,22 @@ struct PTO2OrchestratorState; // forward declare +// ============================================================================= +// TensorMap Lookup Profiling (must precede inline lookup/insert methods) +// ============================================================================= +#ifndef PTO2_TENSORMAP_PROFILING +#define PTO2_TENSORMAP_PROFILING 0 +#endif + +#if PTO2_TENSORMAP_PROFILING +extern uint64_t g_lookup_chain_total; +extern uint64_t g_lookup_count; +extern int32_t g_lookup_chain_max; +extern uint64_t g_lookup_overlap_checks; +extern uint64_t g_lookup_overlap_hits; +extern uint64_t g_insert_count; +#endif + // ============================================================================= // TensorMap Structure // ============================================================================= @@ -212,8 +228,15 @@ struct PTO2TensorMap { PTO2TensorMapEntry* cur_entry = *prev_ptr; result.count = 0; +#if PTO2_TENSORMAP_PROFILING + g_lookup_count++; + int32_t chain_len = 0; +#endif while (cur_entry != nullptr) { +#if PTO2_TENSORMAP_PROFILING + chain_len++; +#endif // Check validity first if (!entry_valid(*cur_entry)) { // 
========== STALE ENTRY: Truncate chain here ========== @@ -227,16 +250,22 @@ struct PTO2TensorMap { remove_entry(*cur_entry); cur_entry = next_entry; } - return; + break; } // Entry is valid - check if regions OVERLAP (not just exact match) // Since we hash only by base_ptr, all entries in this bucket have // potential to overlap. We must check actual byte-range overlap. if (query_tensor_data.buffer.addr == cur_entry->addr) { +#if PTO2_TENSORMAP_PROFILING + g_lookup_overlap_checks++; +#endif auto overlap_status = query_tensor_data.is_overlap(cur_entry->tensor.data()); if (overlap_status != OverlapStatus::NO_OVERLAP) { result.push(cur_entry, overlap_status); +#if PTO2_TENSORMAP_PROFILING + g_lookup_overlap_hits++; +#endif } } @@ -244,6 +273,10 @@ struct PTO2TensorMap { prev_ptr = &cur_entry->next_in_bucket; cur_entry = *prev_ptr; } +#if PTO2_TENSORMAP_PROFILING + g_lookup_chain_total += chain_len; + if (chain_len > g_lookup_chain_max) g_lookup_chain_max = chain_len; +#endif } /** @@ -256,6 +289,9 @@ struct PTO2TensorMap { * @param producer_task_id Task ID of producer */ void insert(const Tensor& tensor, int32_t producer_task_id, bool with_alloc) { +#if PTO2_TENSORMAP_PROFILING + g_insert_count++; +#endif // Allocate entry from ring buffer pool PTO2TensorMapEntry* entry = new_entry(); @@ -394,14 +430,7 @@ struct PTO2TensorMap { void sync_tensormap(); }; -// ============================================================================= -// TensorMap Lookup Profiling -// ============================================================================= -#ifndef PTO2_ORCH_PROFILING -#define PTO2_ORCH_PROFILING 1 -#endif - -#if PTO2_ORCH_PROFILING +#if PTO2_TENSORMAP_PROFILING struct PTO2TensorMapProfilingData { uint64_t lookup_chain_total; uint64_t lookup_count; diff --git a/tools/sched_overhead_analysis.py b/tools/sched_overhead_analysis.py index e6ea363df..51e9b35af 100644 --- a/tools/sched_overhead_analysis.py +++ b/tools/sched_overhead_analysis.py @@ -37,19 +37,11 @@ def 
parse_scheduler_threads(log_path): Expected log format (per thread): Thread N: completed=X tasks in Yus (Z loops, W tasks/loop) - Thread N: --- Phase Breakdown (execution order) --- - Thread N: scan: Xus (Y%) - Thread N: early_ready: Xus (Y%) (deps already met at submit time) - Thread N: complete: Xus (Y%) [fanout: edges=A, max_degree=B, avg=C] - Thread N: dispatch: Xus (Y%) [steal: own=A, steal=B, pct=C%] - Thread N: --- Lock Contention (ready_q) --- - Thread N: total: wait= Xus hold= Yus - Thread N: scan: wait= Xus hold= Yus - Thread N: early_ready: wait= Xus hold= Yus - Thread N: complete: wait= Xus hold= Yus - Thread N: dispatch: wait= Xus hold= Yus - Thread N: hit: wait= Xus hold= Yus (dequeued task) - Thread N: miss: wait= Xus hold= Yus (empty queue) + Thread N: --- Phase Breakdown --- + Thread N: complete: Xus (Y%) [fanout: edges=A, max_degree=B, avg=C] [fanin: edges=D, max_degree=E, avg=F] + Thread N: scan: Xus (Y%) + Thread N: dispatch: Xus (Y%) [pop: hit=A, miss=B, hit_rate=C%] + Thread N: idle: Xus (Y%) """ threads = {} with open(log_path, 'r', errors='ignore') as f: @@ -65,24 +57,12 @@ def parse_scheduler_threads(log_path): 'tasks_per_loop': float(m.group(5)), } - # Phase: scan (distinguished from lock scan by absence of "wait=") - m = re.search(r'Thread (\d+):\s+scan:\s+([\d.]+)us \(\s*([\d.]+)%\)', line) - if m: - tid = int(m.group(1)) - if tid in threads: - threads[tid]['scan_us'] = float(m.group(2)) - threads[tid]['scan_pct'] = float(m.group(3)) - - # Phase: early_ready - m = re.search(r'Thread (\d+):\s+early_ready:\s+([\d.]+)us \(\s*([\d.]+)%\)', line) - if m: - tid = int(m.group(1)) - if tid in threads: - threads[tid]['early_ready_us'] = float(m.group(2)) - threads[tid]['early_ready_pct'] = float(m.group(3)) - - # Phase: complete [fanout: edges=X, max_degree=Y, avg=Z] - m = re.search(r'Thread (\d+):\s+complete:\s+([\d.]+)us \(\s*([\d.]+)%\)\s+\[fanout: edges=(\d+), max_degree=(\d+), avg=([\d.]+)\]', line) + # Phase: complete [fanout: edges=X, 
max_degree=Y, avg=Z] [fanin: edges=D, max_degree=E, avg=F] + m = re.search( + r'Thread (\d+):\s+complete:\s+([\d.]+)us \(\s*([\d.]+)%\)' + r'\s+\[fanout: edges=(\d+), max_degree=(\d+), avg=([\d.]+)\]' + r'\s+\[fanin: edges=(\d+), max_degree=(\d+), avg=([\d.]+)\]', + line) if m: tid = int(m.group(1)) if tid in threads: @@ -91,73 +71,36 @@ def parse_scheduler_threads(log_path): threads[tid]['fanout_edges'] = int(m.group(4)) threads[tid]['fanout_max_degree'] = int(m.group(5)) threads[tid]['fanout_avg'] = float(m.group(6)) + threads[tid]['fanin_edges'] = int(m.group(7)) + threads[tid]['fanin_max_degree'] = int(m.group(8)) + threads[tid]['fanin_avg'] = float(m.group(9)) - # Phase: dispatch [steal: own=X, steal=Y, pct=Z%] - m = re.search(r'Thread (\d+):\s+dispatch:\s+([\d.]+)us \(\s*([\d.]+)%\)\s+\[steal: own=(\d+), steal=(\d+), pct=([\d.]+)%\]', line) - if m: - tid = int(m.group(1)) - if tid in threads: - threads[tid]['dispatch_us'] = float(m.group(2)) - threads[tid]['dispatch_pct'] = float(m.group(3)) - threads[tid]['steal_own'] = int(m.group(4)) - threads[tid]['steal_steal'] = int(m.group(5)) - threads[tid]['steal_pct'] = float(m.group(6)) - - # Lock: total - m = re.search(r'Thread (\d+):\s+total:\s+wait=\s*(\d+)us hold=\s*(\d+)us', line) - if m: - tid = int(m.group(1)) - if tid in threads: - threads[tid]['lock_wait_us'] = int(m.group(2)) - threads[tid]['lock_hold_us'] = int(m.group(3)) - - # Lock: scan - m = re.search(r'Thread (\d+):\s+scan:\s+wait=\s*(\d+)us hold=\s*(\d+)us', line) - if m: - tid = int(m.group(1)) - if tid in threads: - threads[tid]['lock_scan_wait'] = int(m.group(2)) - threads[tid]['lock_scan_hold'] = int(m.group(3)) - - # Lock: early_ready - m = re.search(r'Thread (\d+):\s+early_ready:\s+wait=\s*(\d+)us hold=\s*(\d+)us', line) - if m: - tid = int(m.group(1)) - if tid in threads: - threads[tid]['lock_early_ready_wait'] = int(m.group(2)) - threads[tid]['lock_early_ready_hold'] = int(m.group(3)) - - # Lock: complete - m = re.search(r'Thread 
(\d+):\s+complete:\s+wait=\s*(\d+)us hold=\s*(\d+)us', line) - if m: - tid = int(m.group(1)) - if tid in threads: - threads[tid]['lock_complete_wait'] = int(m.group(2)) - threads[tid]['lock_complete_hold'] = int(m.group(3)) - - # Lock: dispatch - m = re.search(r'Thread (\d+):\s+dispatch:\s+wait=\s*(\d+)us hold=\s*(\d+)us', line) + # Phase: scan + m = re.search(r'Thread (\d+):\s+scan:\s+([\d.]+)us \(\s*([\d.]+)%\)', line) if m: tid = int(m.group(1)) if tid in threads: - threads[tid]['lock_dispatch_wait'] = int(m.group(2)) - threads[tid]['lock_dispatch_hold'] = int(m.group(3)) + threads[tid]['scan_us'] = float(m.group(2)) + threads[tid]['scan_pct'] = float(m.group(3)) - # Lock: dispatch hit - m = re.search(r'Thread (\d+):\s+hit:\s+wait=\s*(\d+)us hold=\s*(\d+)us', line) + # Phase: dispatch [pop: hit=X, miss=Y, hit_rate=Z%] + m = re.search(r'Thread (\d+):\s+dispatch:\s+([\d.]+)us \(\s*([\d.]+)%\)\s+\[pop: hit=(\d+), miss=(\d+), hit_rate=([\d.]+)%\]', line) if m: tid = int(m.group(1)) if tid in threads: - threads[tid]['lock_dispatch_hit_wait'] = int(m.group(2)) - threads[tid]['lock_dispatch_hit_hold'] = int(m.group(3)) + threads[tid]['dispatch_us'] = float(m.group(2)) + threads[tid]['dispatch_pct'] = float(m.group(3)) + threads[tid]['pop_hit'] = int(m.group(4)) + threads[tid]['pop_miss'] = int(m.group(5)) + threads[tid]['pop_hit_rate'] = float(m.group(6)) - # Lock: dispatch miss - m = re.search(r'Thread (\d+):\s+miss:\s+wait=\s*(\d+)us hold=\s*(\d+)us', line) + # Phase: idle + m = re.search(r'Thread (\d+):\s+idle:\s+([\d.]+)us \(\s*([\d.]+)%\)', line) if m: tid = int(m.group(1)) if tid in threads: - threads[tid]['lock_dispatch_miss_wait'] = int(m.group(2)) - threads[tid]['lock_dispatch_miss_hold'] = int(m.group(3)) + threads[tid]['idle_us'] = float(m.group(2)) + threads[tid]['idle_pct'] = float(m.group(3)) return threads @@ -304,12 +247,12 @@ def run_analysis(perf_path, log_path, print_sources=True, selection_strategy=Non print() # Phase breakdown - phases = ['scan', 
'early_ready', 'complete', 'dispatch'] + phases = ['complete', 'scan', 'dispatch', 'idle'] phase_labels = { - 'scan': 'Scan (discover new root tasks)', - 'early_ready': 'Early ready (deps met at submit time)', - 'complete': 'Complete (poll handshake, resolve fanout)', + 'complete': 'Complete (poll handshake, resolve deps)', + 'scan': 'Scan (update perf header)', 'dispatch': 'Dispatch (pop queue, build payload, flush)', + 'idle': 'Idle (spinning, no progress)', } fmt3 = " {:<50} {:>11} {:>10} {:>14}" @@ -328,42 +271,22 @@ def run_analysis(perf_path, log_path, print_sources=True, selection_strategy=Non # Fanout stats (from complete phase) fanout_edges = sum(t.get('fanout_edges', 0) for t in threads.values()) fanout_max = max((t.get('fanout_max_degree', 0) for t in threads.values()), default=0) - fanout_avg_weighted = sum(t.get('fanout_avg', 0) * t.get('fanout_edges', 0) for t in threads.values()) - fanout_avg = fanout_avg_weighted / fanout_edges if fanout_edges > 0 else 0 - print(f' Fanout: total edges={fanout_edges}, max_degree={fanout_max}, avg_degree={fanout_avg:.1f}') - print() - - # Work stealing stats (from dispatch phase) - steal_own = sum(t.get('steal_own', 0) for t in threads.values()) - steal_steal = sum(t.get('steal_steal', 0) for t in threads.values()) - steal_total = steal_own + steal_steal - steal_pct = steal_steal / steal_total * 100 if steal_total > 0 else 0 - print(f' Work stealing: own={steal_own}, stolen={steal_steal} ({steal_pct:.1f}% steal rate)') - print() - - # Lock contention breakdown - fmt4 = " {:<50} {:>11} {:>10}" - print(fmt4.format('Lock contention (ready_q)', 'Total (us)', '% of total')) - print(' ' + '-' * 75) - lock_wait = sum(t.get('lock_wait_us', 0) for t in threads.values()) - lock_hold = sum(t.get('lock_hold_us', 0) for t in threads.values()) - print(fmt4.format(' wait (spinning for lock)', str(lock_wait), f'{lock_wait/total_us*100:.1f}%' if total_us > 0 else '0.0%')) - print(fmt4.format(' hold (inside critical section)', 
str(lock_hold), f'{lock_hold/total_us*100:.1f}%' if total_us > 0 else '0.0%')) + fanout_avg = fanout_edges / total_completed if total_completed > 0 else 0 + print(f' Fanout (notify consumers): total edges={fanout_edges}, max_degree={fanout_max}, avg_degree={fanout_avg:.1f}') + + # Fanin stats (from complete phase) + fanin_edges = sum(t.get('fanin_edges', 0) for t in threads.values()) + fanin_max = max((t.get('fanin_max_degree', 0) for t in threads.values()), default=0) + fanin_avg = fanin_edges / total_completed if total_completed > 0 else 0 + print(f' Fanin (release producers): total edges={fanin_edges}, max_degree={fanin_max}, avg_degree={fanin_avg:.1f}') print() - # Lock wait breakdown by phase - print(' Lock wait by phase:') - for p in phases: - w = sum(t.get(f'lock_{p}_wait', 0) for t in threads.values()) - h = sum(t.get(f'lock_{p}_hold', 0) for t in threads.values()) - print(f' {p:<14} wait={w:>6} us hold={h:>6} us') - # Dispatch hit/miss sub-breakdown - hit_w = sum(t.get('lock_dispatch_hit_wait', 0) for t in threads.values()) - hit_h = sum(t.get('lock_dispatch_hit_hold', 0) for t in threads.values()) - miss_w = sum(t.get('lock_dispatch_miss_wait', 0) for t in threads.values()) - miss_h = sum(t.get('lock_dispatch_miss_hold', 0) for t in threads.values()) - print(f' {"hit":<12} wait={hit_w:>6} us hold={hit_h:>6} us (dequeued task)') - print(f' {"miss":<12} wait={miss_w:>6} us hold={miss_h:>6} us (empty queue)') + # Pop stats (from dispatch phase) + pop_hit = sum(t.get('pop_hit', 0) for t in threads.values()) + pop_miss = sum(t.get('pop_miss', 0) for t in threads.values()) + pop_total = pop_hit + pop_miss + pop_hit_rate = pop_hit / pop_total * 100 if pop_total > 0 else 0 + print(f' Pop: hit={pop_hit}, miss={pop_miss}, hit_rate={pop_hit_rate:.1f}%') print() print('=' * 90) @@ -391,75 +314,29 @@ def run_analysis(perf_path, log_path, print_sources=True, selection_strategy=Non avg_tail_oh = sum(tails) / n loop_ratio = avg_tail_oh / avg_loop_us if avg_loop_us > 0 
else 0 print(f' Avg scheduler loop iteration: {avg_loop_us:.1f} us (approx avg polling interval per loop)') - if n_threads > 0: - print(f' With {n_threads} threads sharing {total_loops} loops over {total_us/n_threads:.0f} us wall each:') - print() - - print(' Scheduler CPU time breakdown (per completed task):') - - # Build phase data with sub-items for sorting - phase_details = { - 'dispatch': { - 'label': 'Dispatch phase (build payload + cache flush)', - 'total': phase_totals.get('dispatch', 0), - 'sub_items': [ - ('Lock wait (ready_q pop)', sum(t.get('lock_dispatch_wait', 0) for t in threads.values())), - ('Lock hold + build + dc cvac/civac + dsb sy', phase_totals.get('dispatch', 0) - sum(t.get('lock_dispatch_wait', 0) for t in threads.values())), - ] - }, - 'complete': { - 'label': 'Complete phase (poll + fanout resolve)', - 'total': phase_totals.get('complete', 0), - 'sub_items': [ - ('Lock wait (ready_q push)', sum(t.get('lock_complete_wait', 0) for t in threads.values())), - ('Fanout traversal + atomic ops', phase_totals.get('complete', 0) - sum(t.get('lock_complete_wait', 0) for t in threads.values())), - ] - }, - 'scan': { - 'label': 'Scan phase (new task discovery)', - 'total': phase_totals.get('scan', 0), - 'sub_items': [] - }, - 'early_ready': { - 'label': 'Early ready (deps met at submit time)', - 'total': phase_totals.get('early_ready', 0), - 'sub_items': [] - }, - } - - # Sort by total descending - for _, detail in sorted(phase_details.items(), key=lambda x: x[1]['total'], reverse=True): - per_task = detail['total'] / total_completed if total_completed > 0 else 0 - pct = detail['total'] / total_us * 100 if total_us > 0 else 0 - print(f' - {detail["label"]:<50} {per_task:.2f} us/task ({pct:.1f}% of scheduler CPU)') - for sub_label, sub_total in detail['sub_items']: - sub_per_task = sub_total / total_completed if total_completed > 0 else 0 - print(f' {sub_label:<48} {sub_per_task:.2f} us/task') - print() print(f' Avg Tail OH = {avg_tail_oh:.1f} us ~= 
{loop_ratio:.1f} x avg loop iteration ({avg_loop_us:.1f} us)') print(f' -> On average, a completed task waits ~{loop_ratio:.1f} loop iterations before being detected') print() - # Data-driven insight: find the dominant phase (excluding early_ready which is typically trivial) + # Data-driven insight: find the dominant phase (excluding idle which is not useful work) work_phases = {p: phase_totals.get(p, 0) for p in ['scan', 'complete', 'dispatch']} dominant_phase = max(work_phases, key=work_phases.get) dominant_pct = work_phases[dominant_phase] / total_us * 100 if total_us > 0 else 0 print(f' Key insight: {phase_labels[dominant_phase].split(" (")[0]} phase consumes ~{dominant_pct:.0f}% of scheduler CPU.') if dominant_phase == 'dispatch': - dispatch_total = phase_totals.get('dispatch', 0) - dispatch_lock_pct = sum(t.get('lock_dispatch_wait', 0) for t in threads.values()) / dispatch_total * 100 if dispatch_total > 0 else 0 - print(f' Within dispatch, lock contention accounts for {dispatch_lock_pct:.0f}% of time.') - if miss_w > hit_w: - print(f' Dispatch miss (empty queue) dominates lock wait: miss={miss_w}us vs hit={hit_w}us.') - print(' Cache flush (dc cvac + dsb sy) is the dominant non-lock cost.') + print(f' Pop hit_rate={pop_hit_rate:.1f}%: {"low hit rate suggests ready queue often empty" if pop_hit_rate < 50 else "good hit rate"}.') + print(' Cache flush (dc cvac + dsb sy) is the dominant non-pop cost.') elif dominant_phase == 'complete': - complete_total = phase_totals.get('complete', 0) - complete_lock_pct = sum(t.get('lock_complete_wait', 0) for t in threads.values()) / complete_total * 100 if complete_total > 0 else 0 - print(f' Within complete, lock contention accounts for {complete_lock_pct:.0f}% of time.') - print(' Fanout traversal and atomic ops dominate the non-lock cost.') + total_edges = fanout_edges + fanin_edges + print(f' Fanout: avg_degree={fanout_avg:.1f}, max_degree={fanout_max}.') + print(f' Fanin: avg_degree={fanin_avg:.1f}, 
max_degree={fanin_max}.') + if fanin_edges > fanout_edges: + print(' Fanin traversal (release_producer + check_consumed) dominates the complete phase.') + else: + print(' Fanout traversal and atomic ops dominate the complete phase.') elif dominant_phase == 'scan': - print(' Scan phase overhead indicates too many root tasks or inefficient task graph traversal.') + print(' Scan phase overhead indicates frequent perf header updates.') print('=' * 90) return 0 diff --git a/tools/swimlane_converter.py b/tools/swimlane_converter.py index db1ec714c..4de82b6e9 100644 --- a/tools/swimlane_converter.py +++ b/tools/swimlane_converter.py @@ -137,10 +137,20 @@ def parse_sched_cpu_from_device_log(log_path, task_count): if total_sched_cpu_us <= 0: return None - return total_sched_cpu_us / task_count + total_completed = sum(t.get('completed', 0) for t in threads.values()) + if task_count > 0 and total_completed > 0 and abs(total_completed - task_count) / task_count > 0.5: + print(f"Warning: device log has {total_completed} completed tasks " + f"but perf JSON has {task_count}; skipping Sched CPU metric " + f"(device log may be from a different run)", file=sys.stderr) + return None + + return { + 'us_per_task': total_sched_cpu_us / task_count, + 'num_sched_threads': len(threads), + } -def print_task_statistics(tasks, func_id_to_name=None, sched_cpu_us_per_task=None): +def print_task_statistics(tasks, func_id_to_name=None, sched_info=None): """Print task statistics grouped by func_id. Exec = kernel execution time (end_time_us - start_time_us) on AICore. 
@@ -152,8 +162,8 @@ def print_task_statistics(tasks, func_id_to_name=None, sched_cpu_us_per_task=Non Args: tasks: List of task dicts func_id_to_name: Optional dict mapping func_id to function name - sched_cpu_us_per_task: Optional AICPU scheduler CPU time per task (us), - parsed from device log + sched_info: Optional dict with 'us_per_task' (float) and 'num_sched_threads' (int), + parsed from device log by parse_sched_cpu_from_device_log() """ from collections import defaultdict @@ -204,13 +214,12 @@ def print_task_statistics(tasks, func_id_to_name=None, sched_cpu_us_per_task=Non max_finish_time = max(max_finish_time, finish_time) # Print statistics - print("\n" + "=" * 160) + print("\n" + "=" * 110) print("Task Statistics by Function") print(" Exec = kernel time on AICore; Latency = dispatch->finish (incl. head OH + Exec + tail OH)") - print("=" * 160) - print(f"{'Func_ID':<8} {'Func_Name':<12} {'Count':^6} {'Total_Exec/Latency(us)':^25} {'Avg_Exec/Latency(us)':^23} " - f"{'Min_Exec/Latency(us)':^23} {'Max_Exec/Latency(us)':^23} {'Avg_Head/Tail_OH(us)':^23} {'Exec_%':^8}") - print("-" * 160) + print("=" * 110) + print(f"{'Func_ID':<8} {'Func_Name':<12} {'Count':>5} {'Avg Exec(us)':>12} {'Avg Latency(us)':>15} {'Exec%':>6} {'Avg Head OH(us)':>15} {'Avg Tail OH(us)':>15}") + print("-" * 110) # Sort by func_id for consistent output total_count = 0 @@ -222,8 +231,6 @@ def print_task_statistics(tasks, func_id_to_name=None, sched_cpu_us_per_task=Non count = len(durations) sum_duration = sum(durations) avg_duration = sum_duration / count - min_duration = min(durations) - max_duration = max(durations) # Accumulate totals total_count += count @@ -235,34 +242,22 @@ def print_task_statistics(tasks, func_id_to_name=None, sched_cpu_us_per_task=Non else: func_name = f"Func_{func_id}" - # Calculate averages for new metrics + # Calculate averages avg_head_overhead = sum(stats['head_overheads']) / len(stats['head_overheads']) if stats['head_overheads'] else 0 avg_tail_overhead = 
sum(stats['tail_overheads']) / len(stats['tail_overheads']) if stats['tail_overheads'] else 0 avg_latency = stats['total_latency'] / count if count > 0 else 0 - min_latency = min(stats['latencies']) if stats['latencies'] else 0 - max_latency = max(stats['latencies']) if stats['latencies'] else 0 - total_latency = stats['total_latency'] # Calculate execution ratio: total_exec_time / total_latency exec_ratio = (stats['total_exec_time'] / stats['total_latency'] * 100) if stats['total_latency'] > 0 else 0 - # Format combined exec/latency values - total_combined = f"{sum_duration:.2f}/{total_latency:.2f}" - avg_combined = f"{avg_duration:.2f}/{avg_latency:.2f}" - min_combined = f"{min_duration:.2f}/{min_latency:.2f}" - max_combined = f"{max_duration:.2f}/{max_latency:.2f}" - overhead_combined = f"{avg_head_overhead:.2f}/{avg_tail_overhead:.2f}" - - print(f"{func_id:<8} {func_name:<12} {count:^6} {total_combined:^25} {avg_combined:^23} " - f"{min_combined:^23} {max_combined:^23} {overhead_combined:^23} {exec_ratio:^7.2f}%") + print(f"{func_id:<8} {func_name:<12} {count:>5} {avg_duration:>12.2f} {avg_latency:>15.2f} {exec_ratio:>5.1f}% {avg_head_overhead:>15.2f} {avg_tail_overhead:>15.2f}") # Print total row - print("-" * 160) + print("-" * 110) # Calculate total latency (sum of all latencies) total_latency_sum = sum(stats['total_latency'] for stats in func_stats.values()) - total_combined = f"{total_duration:.2f}/{total_latency_sum:.2f}" - print(f"{'TOTAL':<21} {total_count:^6} {total_combined:^25}") + print(f"{'TOTAL':<21} {total_count:>5} {total_duration:>12.2f} {total_latency_sum:>15.2f}") # Print total test execution time if min_dispatch_time != float('inf') and max_finish_time != float('-inf'): @@ -276,12 +271,19 @@ def print_task_statistics(tasks, func_id_to_name=None, sched_cpu_us_per_task=Non exec_latency_ratio_pct = total_duration / total_latency_sum * 100 print("\n--- Task execution vs Scheduler overhead ---") print(f" Per-task (all): Avg Exec = 
{avg_exec_us:.2f} us, Avg Latency (dispatch->finish) = {avg_latency_us:.2f} us, Exec/Latency = {exec_latency_ratio_pct:.2f}%") - if sched_cpu_us_per_task is not None: - exec_sched_cpu_ratio = (avg_exec_us / sched_cpu_us_per_task * 100) if sched_cpu_us_per_task > 0 else 0 - print(f" Sched CPU (from device log): {sched_cpu_us_per_task:.2f} us/task (Exec/Sched_CPU = {exec_sched_cpu_ratio:.2f}%)") - print(" (Latency = dispatch to finish; Sched CPU = AICPU scheduler thread CPU time per task, from device log.)") + if sched_info is not None: + sched_cpu = sched_info['us_per_task'] + num_cores = len(set(t['core_id'] for t in tasks)) + exec_sched_ratio = (avg_exec_us / sched_cpu * 100) if sched_cpu > 0 else 0 + per_core_exec = avg_exec_us / num_cores if num_cores > 0 else 0 + per_core_ratio = (per_core_exec / sched_cpu * 100) if sched_cpu > 0 else 0 + num_threads = sched_info['num_sched_threads'] + print(f" Sched CPU (from device log): {sched_cpu:.2f} us/task (Exec/Sched = {exec_sched_ratio:.1f}%, PerCore/Sched = {per_core_ratio:.1f}%)") + print(f" (Latency = dispatch→finish; Sched CPU = scheduler thread CPU per task; PerCore = avg_exec/{num_cores}_cores vs sched_cpu, {num_threads} sched threads)") + else: + print(" (Latency = dispatch→finish; Sched CPU = scheduler thread CPU per task)") - print("=" * 160) + print("=" * 110) @@ -564,7 +566,7 @@ def generate_chrome_trace_json(tasks, output_path, func_id_to_name=None, verbose "complete": "good", # green "dispatch": "terrible", # red "scan": "thread_state_running", # blue - "early_ready": "yellow", # yellow + "idle": "yellow", # yellow } for thread_idx, thread_records in enumerate(scheduler_phases): @@ -972,14 +974,14 @@ def main(): print(f"Selection: {log_strategy}") # Optional: parse sched CPU from device log - sched_cpu_us = None + sched_info = None if resolved_device_log is not None: - sched_cpu_us = parse_sched_cpu_from_device_log(resolved_device_log, len(data['tasks'])) - if args.verbose and sched_cpu_us is not None: - 
print(f" Parsed sched CPU from device log: {sched_cpu_us:.2f} us/task") + sched_info = parse_sched_cpu_from_device_log(resolved_device_log, len(data['tasks'])) + if args.verbose and sched_info is not None: + print(f" Parsed sched CPU from device log: {sched_info['us_per_task']:.2f} us/task") # Print task statistics (incl. task execution vs scheduler overhead) - print_task_statistics(data['tasks'], func_names, sched_cpu_us_per_task=sched_cpu_us) + print_task_statistics(data['tasks'], func_names, sched_info=sched_info) # Integrated deep-dive overhead analysis if resolved_device_log is not None: