Merged
2 changes: 1 addition & 1 deletion src/platform/include/common/perf_profiling.h
@@ -205,7 +205,7 @@ enum class AicpuPhaseId : uint32_t {
SCHED_COMPLETE = 0, // Process completed tasks (fanout traversal)
SCHED_DISPATCH = 1, // Dispatch ready tasks to idle cores
SCHED_SCAN = 2, // Incremental scan for root tasks
SCHED_EARLY_READY = 3, // Drain orchestrator's early-ready queue
SCHED_IDLE_WAIT = 3, // Idle/spinning (no progress)
// Orchestrator phases (16-24)
ORCH_SYNC = 16, // tensormap sync
ORCH_ALLOC = 17, // task_ring_alloc
2 changes: 1 addition & 1 deletion src/platform/src/host/performance_collector.cpp
@@ -511,7 +511,7 @@ int PerformanceCollector::export_swimlane_json(const std::string& output_path) {
case AicpuPhaseId::SCHED_COMPLETE: phase_name = "complete"; break;
case AicpuPhaseId::SCHED_DISPATCH: phase_name = "dispatch"; break;
case AicpuPhaseId::SCHED_SCAN: phase_name = "scan"; break;
case AicpuPhaseId::SCHED_EARLY_READY: phase_name = "early_ready"; break;
case AicpuPhaseId::SCHED_IDLE_WAIT: phase_name = "idle"; break;
default: break;
}

205 changes: 138 additions & 67 deletions src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp

Large diffs are not rendered by default.

119 changes: 40 additions & 79 deletions src/runtime/tensormap_and_ringbuffer/doc/device_log_profiling.md
@@ -37,18 +37,18 @@ Thread 3 loads the orchestration `.so` via `dlopen`, calls `aicpu_orchestration_
Thread 3: Calling aicpu_orchestration_entry from SO
aicpu_orchestration_entry ">>>>>> batch = 64"
Thread 3: aicpu_orchestration_entry returned, cost 20943.940us
=== Orchestrator Profiling: 16704 tasks, total=14601.580us ===
sync_tensormap : 286.300us (2.0%)
task_ring_alloc: 380.400us (2.6%)
param_copy : 2147.800us (14.7%)
lookup+dep : 7290.300us (49.9%)
heap_alloc : 701.500us (4.8%)
tensormap_ins : 1890.380us (12.9%)
fanin+ready : 1207.400us (8.3%)
finalize+SM : 697.500us (4.8%)
scope_end : 364.080us
avg/task : 0.874us
PTO2 total submitted tasks = 16704
Thread 3: === Orchestrator Profiling: 16704 tasks, total=14601.580us ===
Thread 3: sync_tensormap : 286.300us (2.0%)
Thread 3: task_ring_alloc: 380.400us (2.6%)
Thread 3: param_copy : 2147.800us (14.7%)
Thread 3: lookup+dep : 7290.300us (49.9%)
Thread 3: heap_alloc : 701.500us (4.8%)
Thread 3: tensormap_ins : 1890.380us (12.9%)
Thread 3: fanin+ready : 1207.400us (8.3%)
Thread 3: finalize+SM : 697.500us (4.8%)
Thread 3: scope_end : 364.080us
Thread 3: avg/task : 0.874us
Thread 3: PTO2 total submitted tasks = 16704
```

### Field Reference
@@ -79,26 +79,17 @@ PTO2 total submitted tasks = 16704

## Block 2: PTO2 Scheduler Summary

Each of the 3 scheduler threads (Thread 0, 1, 2) prints its own summary after completing all tasks. The output has three sub-sections: **summary**, **phase breakdown**, and **lock contention**.
Each of the 3 scheduler threads (Thread 0, 1, 2) prints its own summary after completing all tasks. The output has two sub-sections: **summary** and **phase breakdown**.

### Example (Thread 0, from the same run)
### Example (Thread 0, from a different run: batch=1, 1044 tasks)

```
Thread 0: === PTO2 Scheduler Summary ===
Thread 0: completed=6068 tasks in 31398us (977 loops, 6.2 tasks/loop)
Thread 0: --- Phase Breakdown (execution order) ---
Thread 0: scan: 2295us ( 7.3%)
Thread 0: early_ready: 77us ( 0.2%) (deps already met at submit time)
Thread 0: complete: 11374us (36.2%) [fanout: edges=7578, max_degree=20, avg=1.2]
Thread 0: dispatch: 17651us (56.2%) [steal: own=4443, steal=1625, pct=26.8%]
Thread 0: --- Lock Contention (ready_q) ---
Thread 0: total: wait= 8366us hold= 4144us
Thread 0: scan: wait= 318us hold= 704us
Thread 0: early_ready: wait= 0us hold= 0us
Thread 0: complete: wait= 1374us hold= 781us
Thread 0: dispatch: wait= 6674us hold= 2659us
Thread 0: hit: wait= 1361us hold= 551us (dequeued task)
Thread 0: miss: wait= 5313us hold= 2108us (empty queue)
Thread 0: completed=352 tasks in 3477.420us (147 loops, 2.4 tasks/loop)
Thread 0: --- Phase Breakdown ---
Thread 0: complete: 1485.020us (42.7%) [fanout: edges=432, max_degree=2, avg=1.2] [fanin: edges=320, max_degree=3, avg=0.9]
Thread 0: scan: 14.400us (0.4%)
Thread 0: dispatch: 1973.060us (56.7%) [pop: hit=352, miss=3043, hit_rate=10.4%]
Thread 0: idle: 4.940us (0.1%)
```

### Summary Line
@@ -116,63 +107,36 @@ Thread N: completed=X tasks in Yus (Z loops, W tasks/loop)

### Phase Breakdown

The scheduler loop runs four phases in order each iteration. Each phase's time is accumulated across all loop iterations.
The scheduler loop runs four phases each iteration. Each phase's time is accumulated across all loop iterations.

| Phase | What it does | Inline stats |
|-------|-------------|-------------|
| **scan** | Scans newly submitted tasks in shared memory; enqueues root tasks (those with `fanin_count == 0`) into the ready queue | — |
| **early_ready** | Drains the orchestrator's ready queue (tasks whose dependencies were all satisfied at submit time, detected via Step 5b in `pto2_submit_task`) | — |
| **complete** | Polls register `COND` on each managed core; when a core becomes `IDLE`, traverses the completed task's fanout list, increments consumer refcounts, and enqueues newly ready consumers | `edges`: total fanout edges traversed; `max_degree`: largest fanout list; `avg`: average fanout per completed task |
| **dispatch** | For each idle core, pops a task from the ready queue (own shard first, then work-stealing from other shards), builds the dispatch payload, and writes the task to the core's register | `own`: tasks dequeued from own shard; `steal`: tasks stolen from other shards; `pct`: steal percentage |
| **complete** | Polls handshake on each managed core; when a core completes, traverses fanout list (notify consumers) and fanin list (release producers) via `on_task_complete` | `fanout`: edges/max_degree/avg for consumer notification; `fanin`: edges/max_degree/avg for producer release |
| **scan** | Updates the perf profiling header with latest scheduler state | — |
| **dispatch** | For each idle core, pops a task from the ready queue via `pto2_scheduler_get_ready_task`, builds the dispatch payload, and writes the task to the core's handshake register | `pop`: `hit` = successful pops (task dispatched), `miss` = empty queue pops, `hit_rate` = hit/(hit+miss) |
| **idle** | Scheduler loop iteration where no progress was made (no completions, no dispatches) | — |

**Interpreting phase percentages:**

- **dispatch** is typically the largest (~55%) because it includes both ready-queue pops (with lock contention) and the actual register writes + payload construction.
- **complete** is the second largest (~36%) because fanout traversal involves atomic operations (`SEQ_CST` fetch_add on refcounts) and conditional ready-queue pushes.
- **scan** is small (~7%) — it only runs until all submitted tasks have been scanned.
- **early_ready** is negligible in most cases.
- **dispatch** is typically the largest (~55-60%) because it includes ready-queue pops (with spinlock), payload construction, and cache flush (`dc cvac` + `dsb sy`).
- **complete** is the second largest (~40-45%) because it traverses both fanout (CAS-based fanin decrement, conditional ready-queue push) and fanin (release_producer, check_consumed, ring pointer advancement).
- **scan** is small (<1%) — only updates the perf header.
- **idle** is negligible when tasks are flowing; high idle% indicates the scheduler is starved.

### Lock Contention (ready_q)
**Interpreting pop hit_rate:**

Ready queues are sharded (one shard per scheduler thread). Access is protected by per-shard spinlocks. This section reports cumulative lock **wait** (time spinning to acquire) and **hold** (time from acquire to release) for each phase.

```
Thread N: total: wait=Xus hold=Yus # sum across all phases
Thread N: scan: wait=Xus hold=Yus # lock during root-task enqueue
Thread N: early_ready: wait=Xus hold=Yus # lock during orch-ready drain
Thread N: complete: wait=Xus hold=Yus # lock during fanout push
Thread N: dispatch: wait=Xus hold=Yus # lock during ready-queue pop (sum of hit+miss)
Thread N: hit: wait=Xus hold=Yus # pop attempts that dequeued a task
Thread N: miss: wait=Xus hold=Yus # pop attempts on empty queue
```

**Key observations from the example:**

| Metric | Thread 0 | Thread 1 | Thread 2 |
|--------|----------|----------|----------|
| completed | 6068 | 5997 | 4639 |
| total time (us) | 31398 | 31410 | 31406 |
| dispatch % | 56.2% | 56.0% | 60.5% |
| complete % | 36.2% | 36.7% | 35.4% |
| steal % | 26.8% | 25.8% | 41.4% |
| lock wait (us) | 8366 | 9176 | 8807 |
| lock hold (us) | 4144 | 4688 | 3971 |

- **Lock contention is moderate**: total wait ~8-9ms out of ~31ms total time (~27-30%).
- **dispatch.miss dominates wait time**: most lock wait comes from polling empty queues, not actual contention. Thread 0's dispatch miss wait = 5313us vs hit wait = 1361us.
- **Work stealing is active**: Thread 2 steals 41.4% of its tasks, indicating it finishes its own shard's tasks faster and helps drain other shards.
- **Threads are well-balanced**: all three complete within ~31ms, despite different task counts (Thread 2 has fewer tasks but higher steal rate).
- **High hit_rate (>50%)**: Ready queue is well-supplied; dispatch is efficient.
- **Low hit_rate (<10%)**: Ready queue is mostly empty when cores become idle. The bottleneck is upstream (orchestrator submission speed or fanout resolution latency), not dispatch itself.

### Per-Task Averages

Divide each thread's phase times by its `completed` count to get per-task scheduling cost:

| Metric | Formula | Typical value |
|--------|---------|---------------|
| Scheduling overhead per task | total_time / completed | ~5-7 us/task |
| Lock overhead per task | (total_wait + total_hold) / completed | ~1.5-2.5 us/task |
| Dispatch per task | dispatch_time / completed | ~3-4 us/task |
| Complete per task | complete_time / completed | ~2-3 us/task |
| Scheduling overhead per task | total_time / completed | ~5-10 us/task |
| Dispatch per task | dispatch_time / completed | ~3-6 us/task |
| Complete per task | complete_time / completed | ~2-4 us/task |

---

@@ -186,7 +150,7 @@ When `--enable-profiling` is used, the host terminal prints a **Task Statistics
| Avg scheduling overhead | `sum(thread_total) / total_tasks` (device log) | Time AICPU spends scheduling each task |
| Sched/Exec ratio | scheduling / execution | Scheduling overhead relative to kernel execution |

A high sched/exec ratio (e.g., >3x) indicates that scheduling overhead dominates, and optimizations should target the scheduler's non-lock paths (dispatch polling, fanout traversal) before reducing lock contention.
A high sched/exec ratio (e.g., >3x) indicates that scheduling overhead dominates, and optimizations should target the scheduler's dispatch hot path (cache flush, payload construction) or upstream task flow.

---

@@ -196,12 +160,9 @@ A high sched/exec ratio (e.g., >3x) indicates that scheduling overhead dominates
# Find the latest device log for device 2
ls -t $HOME/ascend/log/debug/device-2/device-*.log | head -1

# Extract orchestrator profiling
grep -E "Orchestrator Profiling|sync_tensormap|task_ring_alloc|param_copy|lookup\+dep|heap_alloc|tensormap_ins|fanin\+ready|finalize\+SM|scope_end|avg/task" <logfile>

# Extract scheduler summary
grep -E "Scheduler Summary|completed=|Phase Breakdown|scan:|early_ready:|complete:|dispatch:" <logfile>
# Extract orchestrator profiling (Thread 3)
grep "Thread 3:" <logfile>

# Extract lock contention
grep -E "Lock Contention|total:|scan:|early_ready:|complete:|dispatch:|hit:|miss:" <logfile>
# Extract scheduler profiling (Threads 0/1/2)
grep -E "Thread [012]:" <logfile>
```
@@ -23,7 +23,7 @@
// =============================================================================
// Orchestrator Profiling (compile-time toggle)
// =============================================================================
#if PTO2_ORCH_PROFILING
#if PTO2_PROFILING
#include "aicpu/device_time.h"
#include "aicpu/performance_collector_aicpu.h"
// Weak fallback for builds that don't link device_time.cpp (e.g. host).
@@ -177,7 +177,7 @@ void pto2_scope_begin(PTO2OrchestratorState* orch) {
void pto2_scope_end(PTO2OrchestratorState* orch) {
assert(orch->scope_stack_top >= 0 && "Scope stack underflow");

#if PTO2_ORCH_PROFILING
#if PTO2_PROFILING
uint64_t _se0 = get_sys_cnt_aicpu();
#endif

@@ -191,7 +191,7 @@ void pto2_scope_end(PTO2OrchestratorState* orch) {
// Rewind the task buffer — these entries are no longer needed
orch->scope_tasks_size = begin;

#if PTO2_ORCH_PROFILING
#if PTO2_PROFILING
uint64_t _se1 = get_sys_cnt_aicpu();
g_orch_scope_end_cycle += (_se1 - _se0);
perf_aicpu_record_orch_phase(AicpuPhaseId::ORCH_SCOPE_END, _se0, _se1, g_orch_submit_idx);
@@ -401,7 +401,7 @@ void pto2_submit_task(
CYCLE_COUNT_LAP_RECORD(g_orch_finalize_cycle, AicpuPhaseId::ORCH_FINALIZE);

orch->tasks_submitted++;
#if PTO2_ORCH_PROFILING
#if PTO2_PROFILING
g_orch_submit_count++;
g_orch_submit_idx++;
#endif
@@ -460,7 +460,7 @@ void pto2_orchestrator_print_scope_stack(PTO2OrchestratorState* orch) {
LOG_INFO("==================");
}

#if PTO2_ORCH_PROFILING
#if PTO2_PROFILING
PTO2OrchProfilingData pto2_orchestrator_get_profiling() {
PTO2OrchProfilingData d;
d.sync_cycle = g_orch_sync_cycle;
@@ -229,11 +229,11 @@ void pto2_orchestrator_print_scope_stack(PTO2OrchestratorState* orch);
// Orchestrator Profiling Data
// =============================================================================

#ifndef PTO2_ORCH_PROFILING
#define PTO2_ORCH_PROFILING 1
#ifndef PTO2_PROFILING
#define PTO2_PROFILING 1
#endif

#if PTO2_ORCH_PROFILING
#if PTO2_PROFILING
struct PTO2OrchProfilingData {
uint64_t sync_cycle;
uint64_t alloc_cycle;
36 changes: 31 additions & 5 deletions src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
@@ -18,6 +18,12 @@
#ifndef PTO_SCHEDULER_H
#define PTO_SCHEDULER_H

// PTO2_PROFILING must be defined before inline methods that use it.
// Default ON; disable with -DPTO2_PROFILING=0.
#ifndef PTO2_PROFILING
#define PTO2_PROFILING 1
#endif

#include <atomic>

#include "pto_runtime2_types.h"
@@ -126,6 +132,15 @@ void pto2_ready_queue_reset(PTO2ReadyQueue* queue);
// Scheduler State
// =============================================================================

/**
* Statistics returned by on_task_complete
*/
struct PTO2CompletionStats {
int32_t fanout_edges; // Number of fanout edges traversed (notify consumers)
int32_t tasks_enqueued; // Number of consumers that became READY
int32_t fanin_edges; // Number of fanin edges traversed (release producers)
};

/**
* Scheduler state structure
*
@@ -235,7 +250,7 @@ struct PTO2SchedulerState {
check_and_handle_consumed(producer_id, producer);
}

void release_fanin_and_check_ready(int32_t task_id,
bool release_fanin_and_check_ready(int32_t task_id,
PTO2TaskDescriptor* task) {
int32_t slot = pto2_task_slot(task_id);

@@ -249,8 +264,10 @@
if (task_state[slot].compare_exchange_strong(
expected, PTO2_TASK_READY, std::memory_order_acq_rel, std::memory_order_acquire)) {
ready_queues[task->worker_type].push(task_id);
return true;
}
}
return false;
}

void init_task(int32_t task_id, PTO2TaskDescriptor* task) {
@@ -289,10 +306,10 @@ struct PTO2SchedulerState {
}
}

int32_t on_task_complete(int32_t task_id) {
PTO2CompletionStats on_task_complete(int32_t task_id) {
PTO2CompletionStats stats = {0, 0, 0};
int32_t slot = pto2_task_slot(task_id);
PTO2TaskDescriptor* task = pto2_sm_get_task(sm_handle, task_id);
int32_t fanout_notified = 0;

tasks_completed.fetch_add(1, std::memory_order_relaxed);
pto2_fanout_lock(task);
@@ -301,22 +318,31 @@
pto2_fanout_unlock(task);

while (current != nullptr) {
fanout_notified++;
int32_t consumer_id = current->task_id;
PTO2TaskDescriptor* consumer = pto2_sm_get_task(sm_handle, consumer_id);
#if PTO2_PROFILING
stats.fanout_edges++;
if (release_fanin_and_check_ready(consumer_id, consumer)) {
stats.tasks_enqueued++;
}
#else
release_fanin_and_check_ready(consumer_id, consumer);
#endif
current = current->next;
}

current = task->fanin_head;
while (current != nullptr) {
int32_t producer_id = current->task_id;
release_producer(producer_id);
#if PTO2_PROFILING
stats.fanin_edges++;
#endif
current = current->next;
}

check_and_handle_consumed(task_id, task);
return fanout_notified;
return stats;
}
};

16 changes: 8 additions & 8 deletions src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp
@@ -25,13 +25,13 @@
// =============================================================================
// TensorMap Lookup Chain Length Statistics (compile-time toggle)
// =============================================================================
#if PTO2_ORCH_PROFILING
static uint64_t g_lookup_chain_total = 0;
static uint64_t g_lookup_count = 0;
static int32_t g_lookup_chain_max = 0;
static uint64_t g_lookup_overlap_checks = 0;
static uint64_t g_lookup_overlap_hits = 0;
static uint64_t g_insert_count = 0;
#if PTO2_TENSORMAP_PROFILING
uint64_t g_lookup_chain_total = 0;
uint64_t g_lookup_count = 0;
int32_t g_lookup_chain_max = 0;
uint64_t g_lookup_overlap_checks = 0;
uint64_t g_lookup_overlap_hits = 0;
uint64_t g_insert_count = 0;
#endif

// =============================================================================
Expand Down Expand Up @@ -253,7 +253,7 @@ void PTO2TensorMap::sync_tensormap() {
// =============================================================================
// TensorMap Lookup Profiling
// =============================================================================
#if PTO2_ORCH_PROFILING
#if PTO2_TENSORMAP_PROFILING
PTO2TensorMapProfilingData pto2_tensormap_get_profiling() {
PTO2TensorMapProfilingData d;
d.lookup_chain_total = g_lookup_chain_total;