diff --git a/src/platform/include/common/perf_profiling.h b/src/platform/include/common/perf_profiling.h index 596f52a16..89744cd4e 100644 --- a/src/platform/include/common/perf_profiling.h +++ b/src/platform/include/common/perf_profiling.h @@ -205,7 +205,7 @@ enum class AicpuPhaseId : uint32_t { SCHED_COMPLETE = 0, // Process completed tasks (fanout traversal) SCHED_DISPATCH = 1, // Dispatch ready tasks to idle cores SCHED_SCAN = 2, // Incremental scan for root tasks - SCHED_EARLY_READY = 3, // Drain orchestrator's early-ready queue + SCHED_IDLE_WAIT = 3, // Idle/spinning (no progress) // Orchestrator phases (16-24) ORCH_SYNC = 16, // tensormap sync ORCH_ALLOC = 17, // task_ring_alloc diff --git a/src/platform/src/host/performance_collector.cpp b/src/platform/src/host/performance_collector.cpp index ea7a86cab..4b49210dd 100644 --- a/src/platform/src/host/performance_collector.cpp +++ b/src/platform/src/host/performance_collector.cpp @@ -511,7 +511,7 @@ int PerformanceCollector::export_swimlane_json(const std::string& output_path) { case AicpuPhaseId::SCHED_COMPLETE: phase_name = "complete"; break; case AicpuPhaseId::SCHED_DISPATCH: phase_name = "dispatch"; break; case AicpuPhaseId::SCHED_SCAN: phase_name = "scan"; break; - case AicpuPhaseId::SCHED_EARLY_READY: phase_name = "early_ready"; break; + case AicpuPhaseId::SCHED_IDLE_WAIT: phase_name = "idle"; break; default: break; } diff --git a/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 266c44500..063260e73 100644 --- a/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -38,11 +38,11 @@ #include "aicpu/platform_regs.h" // Scheduler profiling helper -#ifndef PTO2_ORCH_PROFILING -#define PTO2_ORCH_PROFILING 1 +#ifndef PTO2_PROFILING +#define PTO2_PROFILING 1 #endif -#if PTO2_ORCH_PROFILING +#if PTO2_PROFILING // Accumulated nanoseconds per sub-step 
#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1 #define CYCLE_COUNT_LAP(acc) do { _t1 = get_sys_cnt_aicpu(); acc += (_t1 - _t0); _t0 = _t1; } while(0) @@ -473,21 +473,31 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx, int32_t last_reported_task_count = 0; // Scheduler profiling counters -#if PTO2_ORCH_PROFILING +#if PTO2_PROFILING uint64_t sched_scan_cycle = 0; - uint64_t sched_orch_drain_cycle = 0; uint64_t sched_complete_cycle = 0; uint64_t sched_dispatch_cycle = 0; - uint64_t sched_yield_cycle = 0; + uint64_t sched_idle_cycle = 0; uint64_t sched_loop_count = 0; - uint64_t sched_yield_count = 0; + uint64_t notify_edges_total = 0; + int32_t notify_max_degree = 0; + uint64_t notify_tasks_enqueued = 0; + uint64_t fanin_edges_total = 0; + int32_t fanin_max_degree = 0; + uint64_t pop_hit = 0; + uint64_t pop_miss = 0; + uint32_t phase_complete_count = 0; + uint32_t phase_dispatch_count = 0; #endif while (true) { -#if PTO2_ORCH_PROFILING +#if PTO2_PROFILING sched_loop_count++; #endif CYCLE_COUNT_START(); +#if PTO2_PROFILING + uint64_t _t0_phase = _t0; +#endif // Dynamic task_count (Thread 3 sets total_tasks_ when orchestration completes) int32_t task_count = total_tasks_.load(std::memory_order_acquire); bool orch_done = orchestrator_done_.load(std::memory_order_acquire); @@ -531,7 +541,15 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx, PTO2DispatchPayload* payload = &s_pto2_payload_per_core[core_id]; int32_t task_id = executing_task_ids_[core_id]; - rt->scheduler.on_task_complete(task_id); + PTO2CompletionStats cstats = rt->scheduler.on_task_complete(task_id); +#if PTO2_PROFILING + notify_edges_total += cstats.fanout_edges; + if (cstats.fanout_edges > notify_max_degree) notify_max_degree = cstats.fanout_edges; + notify_tasks_enqueued += cstats.tasks_enqueued; + fanin_edges_total += cstats.fanin_edges; + if (cstats.fanin_edges > fanin_max_degree) fanin_max_degree = cstats.fanin_edges; 
+ phase_complete_count++; +#endif executing_task_ids_[core_id] = AICPU_TASK_INVALID; // Write AICPU dispatch/finish timestamps into the PerfRecord @@ -568,6 +586,14 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx, } } CYCLE_COUNT_LAP(sched_complete_cycle); +#if PTO2_PROFILING + if (profiling_enabled && phase_complete_count > 0) { + perf_aicpu_record_phase(thread_idx, AicpuPhaseId::SCHED_COMPLETE, + _t0_phase, _t1, sched_loop_count, phase_complete_count); + _t0_phase = _t1; + phase_complete_count = 0; + } +#endif // Phase 2: Dispatch ready tasks to idle cores (register-based dispatch) if (cur_thread_tasks_in_flight < core_num) { @@ -581,6 +607,10 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx, PTO2WorkerType wt = (h->core_type == CoreType::AIC) ? PTO2_WORKER_CUBE : PTO2_WORKER_VECTOR; int32_t task_id = rt->scheduler.get_ready_task(wt); if (task_id >= 0) { +#if PTO2_PROFILING + pop_hit++; + phase_dispatch_count++; +#endif PTO2TaskDescriptor* task = &task_descriptors[task_id & window_mask]; PTO2DispatchPayload* payload = &s_pto2_payload_per_core[core_id]; build_pto2_payload(payload, runtime, task, task_descriptors, dep_list_pool, window_size); @@ -598,11 +628,23 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx, cur_thread_tasks_in_flight++; made_progress = true; DEV_DEBUG("Thread %d: Dispatching PTO2 task %d to core %d", thread_idx, task_id, core_id); + } else { +#if PTO2_PROFILING + pop_miss++; +#endif } } } } CYCLE_COUNT_LAP(sched_dispatch_cycle); +#if PTO2_PROFILING + if (profiling_enabled && phase_dispatch_count > 0) { + perf_aicpu_record_phase(thread_idx, AicpuPhaseId::SCHED_DISPATCH, + _t0_phase, _t1, sched_loop_count, phase_dispatch_count); + _t0_phase = _t1; + phase_dispatch_count = 0; + } +#endif // Update perf header total_tasks if visible tasks have changed { @@ -617,6 +659,13 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx, } 
} CYCLE_COUNT_LAP(sched_scan_cycle); +#if PTO2_PROFILING + if (profiling_enabled) { + perf_aicpu_record_phase(thread_idx, AicpuPhaseId::SCHED_SCAN, + _t0_phase, _t1, sched_loop_count, 0); + _t0_phase = _t1; + } +#endif if (!made_progress) { idle_iterations++; @@ -680,44 +729,64 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx, SPIN_WAIT_HINT(); } PTO2_SPIN_PAUSE_LIGHT(); -#if PTO2_ORCH_PROFILING - sched_yield_count++; + CYCLE_COUNT_LAP(sched_idle_cycle); +#if PTO2_PROFILING + if (profiling_enabled) { + perf_aicpu_record_phase(thread_idx, AicpuPhaseId::SCHED_IDLE_WAIT, + _t0_phase, _t1, sched_loop_count, 0); + _t0_phase = _t1; + } #endif - CYCLE_COUNT_LAP(sched_yield_cycle); } else { idle_iterations = 0; } } -#if PTO2_ORCH_PROFILING +#if PTO2_PROFILING + if (profiling_enabled) { uint64_t sched_total = - sched_scan_cycle + sched_orch_drain_cycle + sched_complete_cycle + sched_dispatch_cycle + sched_yield_cycle; + sched_complete_cycle + sched_scan_cycle + sched_dispatch_cycle + sched_idle_cycle; if (sched_total == 0) sched_total = 1; // avoid div-by-zero - DEV_ALWAYS("Thread %d: PTO2 scheduler stats: loops=%llu, completed=%d, total=%.3fus", + double tasks_per_loop = sched_loop_count > 0 ? (double)cur_thread_completed / sched_loop_count : 0.0; + DEV_ALWAYS("Thread %d: completed=%d tasks in %.3fus (%llu loops, %.1f tasks/loop)", thread_idx, - (unsigned long long)sched_loop_count, cur_thread_completed, - cycles_to_us(sched_total)); - DEV_ALWAYS( - "Thread %d: scan=%.3fus (%.1f%%), orch_drain=%.3fus (%.1f%%), complete=%.3fus (%.1f%%), dispatch=%.3fus " - "(%.1f%%)", + cycles_to_us(sched_total), + (unsigned long long)sched_loop_count, + tasks_per_loop); + DEV_ALWAYS("Thread %d: --- Phase Breakdown ---", thread_idx); + double notify_avg = cur_thread_completed > 0 + ? (double)notify_edges_total / cur_thread_completed : 0.0; + double fanin_avg = cur_thread_completed > 0 + ? 
(double)fanin_edges_total / cur_thread_completed : 0.0; + DEV_ALWAYS("Thread %d: complete: %.3fus (%.1f%%) [fanout: edges=%llu, max_degree=%d, avg=%.1f] [fanin: edges=%llu, max_degree=%d, avg=%.1f]", thread_idx, - cycles_to_us(sched_scan_cycle), - sched_scan_cycle * 100.0 / sched_total, - cycles_to_us(sched_orch_drain_cycle), - sched_orch_drain_cycle * 100.0 / sched_total, cycles_to_us(sched_complete_cycle), sched_complete_cycle * 100.0 / sched_total, + (unsigned long long)notify_edges_total, + notify_max_degree, + notify_avg, + (unsigned long long)fanin_edges_total, + fanin_max_degree, + fanin_avg); + DEV_ALWAYS("Thread %d: scan: %.3fus (%.1f%%)", + thread_idx, + cycles_to_us(sched_scan_cycle), + sched_scan_cycle * 100.0 / sched_total); + uint64_t pop_total = pop_hit + pop_miss; + double pop_hit_rate = pop_total > 0 ? pop_hit * 100.0 / pop_total : 0.0; + DEV_ALWAYS("Thread %d: dispatch: %.3fus (%.1f%%) [pop: hit=%llu, miss=%llu, hit_rate=%.1f%%]", + thread_idx, cycles_to_us(sched_dispatch_cycle), - sched_dispatch_cycle * 100.0 / sched_total); - DEV_ALWAYS("Thread %d: yield=%.3fus (%.1f%%, %llu calls, avg=%.1fus)", + sched_dispatch_cycle * 100.0 / sched_total, + (unsigned long long)pop_hit, + (unsigned long long)pop_miss, + pop_hit_rate); + DEV_ALWAYS("Thread %d: idle: %.3fus (%.1f%%)", thread_idx, - cycles_to_us(sched_yield_cycle), - sched_yield_cycle * 100.0 / sched_total, - (unsigned long long)sched_yield_count, - sched_yield_count > 0 ? 
cycles_to_us(sched_yield_cycle) / sched_yield_count : 0.0); - - DEV_ALWAYS("Thread %d: PTO2 execution complete, completed %d tasks", thread_idx, cur_thread_completed); + cycles_to_us(sched_idle_cycle), + sched_idle_cycle * 100.0 / sched_total); + } #endif // Flush performance buffers for cores managed by this thread @@ -911,54 +980,56 @@ int AicpuExecutor::run(Runtime* runtime) { cycles_to_us(orch_cycle_end - orch_cycle_start)); // Print orchestrator profiling data -#if PTO2_ORCH_PROFILING - { +#if PTO2_PROFILING + if (runtime->enable_profiling) { PTO2OrchProfilingData p = pto2_orchestrator_get_profiling(); uint64_t total = p.sync_cycle + p.alloc_cycle + p.params_cycle + p.lookup_cycle + p.heap_cycle + p.insert_cycle + p.fanin_cycle + p.finalize_cycle; - DEV_ALWAYS("=== Orchestrator Profiling: %lld tasks, total=%.3fus ===", + if (total == 0) total = 1; // avoid div-by-zero + DEV_ALWAYS("Thread 3: === Orchestrator Profiling: %lld tasks, total=%.3fus ===", (long long)p.submit_count, cycles_to_us(total)); - DEV_ALWAYS(" sync_tensormap : %.3fus (%.1f%%)", cycles_to_us(p.sync_cycle), p.sync_cycle * 100.0 / total); - DEV_ALWAYS(" task_ring_alloc: %.3fus (%.1f%%)", cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total); - DEV_ALWAYS(" param_copy : %.3fus (%.1f%%)", cycles_to_us(p.params_cycle), p.params_cycle * 100.0 / total); - DEV_ALWAYS(" lookup+dep : %.3fus (%.1f%%)", cycles_to_us(p.lookup_cycle), p.lookup_cycle * 100.0 / total); - DEV_ALWAYS(" heap_alloc : %.3fus (%.1f%%)", cycles_to_us(p.heap_cycle), p.heap_cycle * 100.0 / total); - DEV_ALWAYS(" tensormap_ins : %.3fus (%.1f%%)", cycles_to_us(p.insert_cycle), p.insert_cycle * 100.0 / total); - DEV_ALWAYS(" fanin+ready : %.3fus (%.1f%%)", cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total); - DEV_ALWAYS(" finalize+SM : %.3fus (%.1f%%)", cycles_to_us(p.finalize_cycle), p.finalize_cycle * 100.0 / total); - DEV_ALWAYS(" scope_end : %.3fus", cycles_to_us(p.scope_end_cycle)); - DEV_ALWAYS(" avg/task : %.3fus", 
cycles_to_us(total) / p.submit_count); - + DEV_ALWAYS("Thread 3: sync_tensormap : %.3fus (%.1f%%)", cycles_to_us(p.sync_cycle), p.sync_cycle * 100.0 / total); + DEV_ALWAYS("Thread 3: task_ring_alloc: %.3fus (%.1f%%)", cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total); + DEV_ALWAYS("Thread 3: param_copy : %.3fus (%.1f%%)", cycles_to_us(p.params_cycle), p.params_cycle * 100.0 / total); + DEV_ALWAYS("Thread 3: lookup+dep : %.3fus (%.1f%%)", cycles_to_us(p.lookup_cycle), p.lookup_cycle * 100.0 / total); + DEV_ALWAYS("Thread 3: heap_alloc : %.3fus (%.1f%%)", cycles_to_us(p.heap_cycle), p.heap_cycle * 100.0 / total); + DEV_ALWAYS("Thread 3: tensormap_ins : %.3fus (%.1f%%)", cycles_to_us(p.insert_cycle), p.insert_cycle * 100.0 / total); + DEV_ALWAYS("Thread 3: fanin+ready : %.3fus (%.1f%%)", cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total); + DEV_ALWAYS("Thread 3: finalize+SM : %.3fus (%.1f%%)", cycles_to_us(p.finalize_cycle), p.finalize_cycle * 100.0 / total); + DEV_ALWAYS("Thread 3: scope_end : %.3fus", cycles_to_us(p.scope_end_cycle)); + DEV_ALWAYS("Thread 3: avg/task : %.3fus", + p.submit_count > 0 ? cycles_to_us(total) / p.submit_count : 0.0); + +#if PTO2_TENSORMAP_PROFILING PTO2TensorMapProfilingData tp = pto2_tensormap_get_profiling(); - DEV_ALWAYS("=== TensorMap Lookup Stats ==="); - DEV_ALWAYS(" lookups : %llu, inserts: %llu", + DEV_ALWAYS("Thread 3: === TensorMap Lookup Stats ==="); + DEV_ALWAYS("Thread 3: lookups : %llu, inserts: %llu", (unsigned long long)tp.lookup_count, (unsigned long long)tp.insert_count); - DEV_ALWAYS(" chain walked : total=%llu, avg=%.1f, max=%d", + DEV_ALWAYS("Thread 3: chain walked : total=%llu, avg=%.1f, max=%d", (unsigned long long)tp.lookup_chain_total, tp.lookup_count > 0 ? 
(double)tp.lookup_chain_total / tp.lookup_count : 0.0, tp.lookup_chain_max); - DEV_ALWAYS(" overlap checks : %llu, hits=%llu (%.1f%%)", + DEV_ALWAYS("Thread 3: overlap checks : %llu, hits=%llu (%.1f%%)", (unsigned long long)tp.overlap_checks, (unsigned long long)tp.overlap_hits, tp.overlap_checks > 0 ? tp.overlap_hits * 100.0 / tp.overlap_checks : 0.0); +#endif // Write orchestrator summary to shared memory for host-side export - if (runtime->enable_profiling) { - AicpuOrchSummary orch_summary = {}; - orch_summary.start_time = orch_cycle_start; - orch_summary.end_time = orch_cycle_end; - orch_summary.sync_cycle = p.sync_cycle; - orch_summary.alloc_cycle = p.alloc_cycle; - orch_summary.params_cycle = p.params_cycle; - orch_summary.lookup_cycle = p.lookup_cycle; - orch_summary.heap_cycle = p.heap_cycle; - orch_summary.insert_cycle = p.insert_cycle; - orch_summary.fanin_cycle = p.fanin_cycle; - orch_summary.finalize_cycle = p.finalize_cycle; - orch_summary.scope_end_cycle = p.scope_end_cycle; - orch_summary.submit_count = p.submit_count; - perf_aicpu_write_orch_summary(&orch_summary); - } + AicpuOrchSummary orch_summary = {}; + orch_summary.start_time = orch_cycle_start; + orch_summary.end_time = orch_cycle_end; + orch_summary.sync_cycle = p.sync_cycle; + orch_summary.alloc_cycle = p.alloc_cycle; + orch_summary.params_cycle = p.params_cycle; + orch_summary.lookup_cycle = p.lookup_cycle; + orch_summary.heap_cycle = p.heap_cycle; + orch_summary.insert_cycle = p.insert_cycle; + orch_summary.fanin_cycle = p.fanin_cycle; + orch_summary.finalize_cycle = p.finalize_cycle; + orch_summary.scope_end_cycle = p.scope_end_cycle; + orch_summary.submit_count = p.submit_count; + perf_aicpu_write_orch_summary(&orch_summary); } #endif @@ -975,7 +1046,7 @@ int AicpuExecutor::run(Runtime* runtime) { PTO2SharedMemoryHeader* sm_header = static_cast(sm); int32_t pto2_task_count = sm_header ? 
sm_header->current_task_index.load(std::memory_order_acquire) : 0; - DEV_ALWAYS("PTO2 total submitted tasks = %d", pto2_task_count); + DEV_ALWAYS("Thread 3: PTO2 total submitted tasks = %d", pto2_task_count); total_tasks_.store(pto2_task_count, std::memory_order_release); orchestrator_done_.store(true, std::memory_order_release); DEV_INFO("Thread 3: Set orchestrator_done=true, waiting for scheduler threads"); diff --git a/src/runtime/tensormap_and_ringbuffer/doc/device_log_profiling.md b/src/runtime/tensormap_and_ringbuffer/doc/device_log_profiling.md index f18402f5b..4d8d8dc11 100644 --- a/src/runtime/tensormap_and_ringbuffer/doc/device_log_profiling.md +++ b/src/runtime/tensormap_and_ringbuffer/doc/device_log_profiling.md @@ -37,18 +37,18 @@ Thread 3 loads the orchestration `.so` via `dlopen`, calls `aicpu_orchestration_ Thread 3: Calling aicpu_orchestration_entry from SO aicpu_orchestration_entry ">>>>>> batch = 64" Thread 3: aicpu_orchestration_entry returned, cost 20943.940us -=== Orchestrator Profiling: 16704 tasks, total=14601.580us === - sync_tensormap : 286.300us (2.0%) - task_ring_alloc: 380.400us (2.6%) - param_copy : 2147.800us (14.7%) - lookup+dep : 7290.300us (49.9%) - heap_alloc : 701.500us (4.8%) - tensormap_ins : 1890.380us (12.9%) - fanin+ready : 1207.400us (8.3%) - finalize+SM : 697.500us (4.8%) - scope_end : 364.080us - avg/task : 0.874us -PTO2 total submitted tasks = 16704 +Thread 3: === Orchestrator Profiling: 16704 tasks, total=14601.580us === +Thread 3: sync_tensormap : 286.300us (2.0%) +Thread 3: task_ring_alloc: 380.400us (2.6%) +Thread 3: param_copy : 2147.800us (14.7%) +Thread 3: lookup+dep : 7290.300us (49.9%) +Thread 3: heap_alloc : 701.500us (4.8%) +Thread 3: tensormap_ins : 1890.380us (12.9%) +Thread 3: fanin+ready : 1207.400us (8.3%) +Thread 3: finalize+SM : 697.500us (4.8%) +Thread 3: scope_end : 364.080us +Thread 3: avg/task : 0.874us +Thread 3: PTO2 total submitted tasks = 16704 ``` ### Field Reference @@ -79,26 +79,17 @@ PTO2 
total submitted tasks = 16704 ## Block 2: PTO2 Scheduler Summary -Each of the 3 scheduler threads (Thread 0, 1, 2) prints its own summary after completing all tasks. The output has three sub-sections: **summary**, **phase breakdown**, and **lock contention**. +Each of the 3 scheduler threads (Thread 0, 1, 2) prints its own summary after completing all tasks. The output has two sub-sections: **summary** and **phase breakdown**. -### Example (Thread 0, from the same run) +### Example (Thread 0, from a different run: batch=1, 1044 tasks) ``` -Thread 0: === PTO2 Scheduler Summary === -Thread 0: completed=6068 tasks in 31398us (977 loops, 6.2 tasks/loop) -Thread 0: --- Phase Breakdown (execution order) --- -Thread 0: scan: 2295us ( 7.3%) -Thread 0: early_ready: 77us ( 0.2%) (deps already met at submit time) -Thread 0: complete: 11374us (36.2%) [fanout: edges=7578, max_degree=20, avg=1.2] -Thread 0: dispatch: 17651us (56.2%) [steal: own=4443, steal=1625, pct=26.8%] -Thread 0: --- Lock Contention (ready_q) --- -Thread 0: total: wait= 8366us hold= 4144us -Thread 0: scan: wait= 318us hold= 704us -Thread 0: early_ready: wait= 0us hold= 0us -Thread 0: complete: wait= 1374us hold= 781us -Thread 0: dispatch: wait= 6674us hold= 2659us -Thread 0: hit: wait= 1361us hold= 551us (dequeued task) -Thread 0: miss: wait= 5313us hold= 2108us (empty queue) +Thread 0: completed=352 tasks in 3477.420us (147 loops, 2.4 tasks/loop) +Thread 0: --- Phase Breakdown --- +Thread 0: complete: 1485.020us (42.7%) [fanout: edges=432, max_degree=2, avg=1.2] [fanin: edges=320, max_degree=3, avg=0.9] +Thread 0: scan: 14.400us (0.4%) +Thread 0: dispatch: 1973.060us (56.7%) [pop: hit=352, miss=3043, hit_rate=10.4%] +Thread 0: idle: 4.940us (0.1%) ``` ### Summary Line @@ -116,52 +107,26 @@ Thread N: completed=X tasks in Yus (Z loops, W tasks/loop) ### Phase Breakdown -The scheduler loop runs four phases in order each iteration. Each phase's time is accumulated across all loop iterations. 
+The scheduler loop runs four phases each iteration. Each phase's time is accumulated across all loop iterations. | Phase | What it does | Inline stats | |-------|-------------|-------------| -| **scan** | Scans newly submitted tasks in shared memory; enqueues root tasks (those with `fanin_count == 0`) into the ready queue | — | -| **early_ready** | Drains the orchestrator's ready queue (tasks whose dependencies were all satisfied at submit time, detected via Step 5b in `pto2_submit_task`) | — | -| **complete** | Polls register `COND` on each managed core; when a core becomes `IDLE`, traverses the completed task's fanout list, increments consumer refcounts, and enqueues newly ready consumers | `edges`: total fanout edges traversed; `max_degree`: largest fanout list; `avg`: average fanout per completed task | -| **dispatch** | For each idle core, pops a task from the ready queue (own shard first, then work-stealing from other shards), builds the dispatch payload, and writes the task to the core's register | `own`: tasks dequeued from own shard; `steal`: tasks stolen from other shards; `pct`: steal percentage | +| **complete** | Polls handshake on each managed core; when a core completes, traverses fanout list (notify consumers) and fanin list (release producers) via `on_task_complete` | `fanout`: edges/max_degree/avg for consumer notification; `fanin`: edges/max_degree/avg for producer release | +| **scan** | Updates the perf profiling header with latest scheduler state | — | +| **dispatch** | For each idle core, pops a task from the ready queue via `pto2_scheduler_get_ready_task`, builds the dispatch payload, and writes the task to the core's handshake register | `pop`: `hit` = successful pops (task dispatched), `miss` = empty queue pops, `hit_rate` = hit/(hit+miss) | +| **idle** | Scheduler loop iteration where no progress was made (no completions, no dispatches) | — | **Interpreting phase percentages:** -- **dispatch** is typically the largest (~55%) because it 
includes both ready-queue pops (with lock contention) and the actual register writes + payload construction. -- **complete** is the second largest (~36%) because fanout traversal involves atomic operations (`SEQ_CST` fetch_add on refcounts) and conditional ready-queue pushes. -- **scan** is small (~7%) — it only runs until all submitted tasks have been scanned. -- **early_ready** is negligible in most cases. +- **dispatch** is typically the largest (~55-60%) because it includes ready-queue pops (with spinlock), payload construction, and cache flush (`dc cvac` + `dsb sy`). +- **complete** is the second largest (~40-45%) because it traverses both fanout (CAS-based fanin decrement, conditional ready-queue push) and fanin (release_producer, check_consumed, ring pointer advancement). +- **scan** is small (<1%) — only updates the perf header. +- **idle** is negligible when tasks are flowing; high idle% indicates the scheduler is starved. -### Lock Contention (ready_q) +**Interpreting pop hit_rate:** -Ready queues are sharded (one shard per scheduler thread). Access is protected by per-shard spinlocks. This section reports cumulative lock **wait** (time spinning to acquire) and **hold** (time from acquire to release) for each phase. 
- -``` -Thread N: total: wait=Xus hold=Yus # sum across all phases -Thread N: scan: wait=Xus hold=Yus # lock during root-task enqueue -Thread N: early_ready: wait=Xus hold=Yus # lock during orch-ready drain -Thread N: complete: wait=Xus hold=Yus # lock during fanout push -Thread N: dispatch: wait=Xus hold=Yus # lock during ready-queue pop (sum of hit+miss) -Thread N: hit: wait=Xus hold=Yus # pop attempts that dequeued a task -Thread N: miss: wait=Xus hold=Yus # pop attempts on empty queue -``` - -**Key observations from the example:** - -| Metric | Thread 0 | Thread 1 | Thread 2 | -|--------|----------|----------|----------| -| completed | 6068 | 5997 | 4639 | -| total time (us) | 31398 | 31410 | 31406 | -| dispatch % | 56.2% | 56.0% | 60.5% | -| complete % | 36.2% | 36.7% | 35.4% | -| steal % | 26.8% | 25.8% | 41.4% | -| lock wait (us) | 8366 | 9176 | 8807 | -| lock hold (us) | 4144 | 4688 | 3971 | - -- **Lock contention is moderate**: total wait ~8-9ms out of ~31ms total time (~27-30%). -- **dispatch.miss dominates wait time**: most lock wait comes from polling empty queues, not actual contention. Thread 0's dispatch miss wait = 5313us vs hit wait = 1361us. -- **Work stealing is active**: Thread 2 steals 41.4% of its tasks, indicating it finishes its own shard's tasks faster and helps drain other shards. -- **Threads are well-balanced**: all three complete within ~31ms, despite different task counts (Thread 2 has fewer tasks but higher steal rate). +- **High hit_rate (>50%)**: Ready queue is well-supplied; dispatch is efficient. +- **Low hit_rate (<10%)**: Ready queue is mostly empty when cores become idle. The bottleneck is upstream (orchestrator submission speed or fanout resolution latency), not dispatch itself. 
### Per-Task Averages @@ -169,10 +134,9 @@ Divide each thread's phase times by its `completed` count to get per-task schedu | Metric | Formula | Typical value | |--------|---------|---------------| -| Scheduling overhead per task | total_time / completed | ~5-7 us/task | -| Lock overhead per task | (total_wait + total_hold) / completed | ~1.5-2.5 us/task | -| Dispatch per task | dispatch_time / completed | ~3-4 us/task | -| Complete per task | complete_time / completed | ~2-3 us/task | +| Scheduling overhead per task | total_time / completed | ~5-10 us/task | +| Dispatch per task | dispatch_time / completed | ~3-6 us/task | +| Complete per task | complete_time / completed | ~2-4 us/task | --- @@ -186,7 +150,7 @@ When `--enable-profiling` is used, the host terminal prints a **Task Statistics | Avg scheduling overhead | `sum(thread_total) / total_tasks` (device log) | Time AICPU spends scheduling each task | | Sched/Exec ratio | scheduling / execution | Scheduling overhead relative to kernel execution | -A high sched/exec ratio (e.g., >3x) indicates that scheduling overhead dominates, and optimizations should target the scheduler's non-lock paths (dispatch polling, fanout traversal) before reducing lock contention. +A high sched/exec ratio (e.g., >3x) indicates that scheduling overhead dominates, and optimizations should target the scheduler's dispatch hot path (cache flush, payload construction) or upstream task flow. 
--- @@ -196,12 +160,9 @@ A high sched/exec ratio (e.g., >3x) indicates that scheduling overhead dominates # Find the latest device log for device 2 ls -t $HOME/ascend/log/debug/device-2/device-*.log | head -1 -# Extract orchestrator profiling -grep -E "Orchestrator Profiling|sync_tensormap|task_ring_alloc|param_copy|lookup\+dep|heap_alloc|tensormap_ins|fanin\+ready|finalize\+SM|scope_end|avg/task" - -# Extract scheduler summary -grep -E "Scheduler Summary|completed=|Phase Breakdown|scan:|early_ready:|complete:|dispatch:" +# Extract orchestrator profiling (Thread 3) +grep "Thread 3:" -# Extract lock contention -grep -E "Lock Contention|total:|scan:|early_ready:|complete:|dispatch:|hit:|miss:" +# Extract scheduler profiling (Threads 0/1/2) +grep -E "Thread [012]:" ``` diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index 294e7c709..2e8964eb3 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -23,7 +23,7 @@ // ============================================================================= // Orchestrator Profiling (compile-time toggle) // ============================================================================= -#if PTO2_ORCH_PROFILING +#if PTO2_PROFILING #include "aicpu/device_time.h" #include "aicpu/performance_collector_aicpu.h" // Weak fallback for builds that don't link device_time.cpp (e.g. host). 
@@ -177,7 +177,7 @@ void pto2_scope_begin(PTO2OrchestratorState* orch) { void pto2_scope_end(PTO2OrchestratorState* orch) { assert(orch->scope_stack_top >= 0 && "Scope stack underflow"); -#if PTO2_ORCH_PROFILING +#if PTO2_PROFILING uint64_t _se0 = get_sys_cnt_aicpu(); #endif @@ -191,7 +191,7 @@ void pto2_scope_end(PTO2OrchestratorState* orch) { // Rewind the task buffer — these entries are no longer needed orch->scope_tasks_size = begin; -#if PTO2_ORCH_PROFILING +#if PTO2_PROFILING uint64_t _se1 = get_sys_cnt_aicpu(); g_orch_scope_end_cycle += (_se1 - _se0); perf_aicpu_record_orch_phase(AicpuPhaseId::ORCH_SCOPE_END, _se0, _se1, g_orch_submit_idx); @@ -401,7 +401,7 @@ void pto2_submit_task( CYCLE_COUNT_LAP_RECORD(g_orch_finalize_cycle, AicpuPhaseId::ORCH_FINALIZE); orch->tasks_submitted++; -#if PTO2_ORCH_PROFILING +#if PTO2_PROFILING g_orch_submit_count++; g_orch_submit_idx++; #endif @@ -460,7 +460,7 @@ void pto2_orchestrator_print_scope_stack(PTO2OrchestratorState* orch) { LOG_INFO("=================="); } -#if PTO2_ORCH_PROFILING +#if PTO2_PROFILING PTO2OrchProfilingData pto2_orchestrator_get_profiling() { PTO2OrchProfilingData d; d.sync_cycle = g_orch_sync_cycle; diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index e4ce51a40..c63e12a98 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -229,11 +229,11 @@ void pto2_orchestrator_print_scope_stack(PTO2OrchestratorState* orch); // Orchestrator Profiling Data // ============================================================================= -#ifndef PTO2_ORCH_PROFILING -#define PTO2_ORCH_PROFILING 1 +#ifndef PTO2_PROFILING +#define PTO2_PROFILING 1 #endif -#if PTO2_ORCH_PROFILING +#if PTO2_PROFILING struct PTO2OrchProfilingData { uint64_t sync_cycle; uint64_t alloc_cycle; diff --git 
a/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h b/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h index 0e4b7bdd7..b88c21f65 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h @@ -18,6 +18,12 @@ #ifndef PTO_SCHEDULER_H #define PTO_SCHEDULER_H +// PTO2_PROFILING must be defined before inline methods that use it. +// Default ON; disable with -DPTO2_PROFILING=0. +#ifndef PTO2_PROFILING +#define PTO2_PROFILING 1 +#endif + #include <atomic> #include "pto_runtime2_types.h" @@ -126,6 +132,15 @@ void pto2_ready_queue_reset(PTO2ReadyQueue* queue); // ============================================================================= // Scheduler State // ============================================================================= +/** + * Statistics returned by on_task_complete + */ +struct PTO2CompletionStats { + int32_t fanout_edges; // Number of fanout edges traversed (notify consumers) + int32_t tasks_enqueued; // Number of consumers that became READY + int32_t fanin_edges; // Number of fanin edges traversed (release producers) +}; + /** * Scheduler state structure * @@ -235,7 +250,7 @@ struct PTO2SchedulerState { check_and_handle_consumed(producer_id, producer); } - void release_fanin_and_check_ready(int32_t task_id, + bool release_fanin_and_check_ready(int32_t task_id, PTO2TaskDescriptor* task) { int32_t slot = pto2_task_slot(task_id); @@ -249,8 +264,10 @@ struct PTO2SchedulerState { if (task_state[slot].compare_exchange_strong( expected, PTO2_TASK_READY, std::memory_order_acq_rel, std::memory_order_acquire)) { ready_queues[task->worker_type].push(task_id); + return true; } } + return false; } void init_task(int32_t task_id, PTO2TaskDescriptor* task) { @@ -289,10 +306,10 @@ struct PTO2SchedulerState { } } - int32_t on_task_complete(int32_t task_id) { + PTO2CompletionStats on_task_complete(int32_t task_id) { + PTO2CompletionStats stats = {0, 0, 0}; int32_t slot = pto2_task_slot(task_id); PTO2TaskDescriptor* task = pto2_sm_get_task(sm_handle,
task_id); - int32_t fanout_notified = 0; tasks_completed.fetch_add(1, std::memory_order_relaxed); pto2_fanout_lock(task); @@ -301,10 +318,16 @@ struct PTO2SchedulerState { pto2_fanout_unlock(task); while (current != nullptr) { - fanout_notified++; int32_t consumer_id = current->task_id; PTO2TaskDescriptor* consumer = pto2_sm_get_task(sm_handle, consumer_id); +#if PTO2_PROFILING + stats.fanout_edges++; + if (release_fanin_and_check_ready(consumer_id, consumer)) { + stats.tasks_enqueued++; + } +#else release_fanin_and_check_ready(consumer_id, consumer); +#endif current = current->next; } @@ -312,11 +335,14 @@ struct PTO2SchedulerState { while (current != nullptr) { int32_t producer_id = current->task_id; release_producer(producer_id); +#if PTO2_PROFILING + stats.fanin_edges++; +#endif current = current->next; } check_and_handle_consumed(task_id, task); - return fanout_notified; + return stats; } }; diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp index c69aa73fc..5e22297cc 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp @@ -25,13 +25,13 @@ // ============================================================================= // TensorMap Lookup Chain Length Statistics (compile-time toggle) // ============================================================================= -#if PTO2_ORCH_PROFILING -static uint64_t g_lookup_chain_total = 0; -static uint64_t g_lookup_count = 0; -static int32_t g_lookup_chain_max = 0; -static uint64_t g_lookup_overlap_checks = 0; -static uint64_t g_lookup_overlap_hits = 0; -static uint64_t g_insert_count = 0; +#if PTO2_TENSORMAP_PROFILING +uint64_t g_lookup_chain_total = 0; +uint64_t g_lookup_count = 0; +int32_t g_lookup_chain_max = 0; +uint64_t g_lookup_overlap_checks = 0; +uint64_t g_lookup_overlap_hits = 0; +uint64_t g_insert_count = 0; #endif // 
============================================================================= @@ -253,7 +253,7 @@ void PTO2TensorMap::sync_tensormap() { // ============================================================================= // TensorMap Lookup Profiling // ============================================================================= -#if PTO2_ORCH_PROFILING +#if PTO2_TENSORMAP_PROFILING PTO2TensorMapProfilingData pto2_tensormap_get_profiling() { PTO2TensorMapProfilingData d; d.lookup_chain_total = g_lookup_chain_total; diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h index 7b22c6e62..7e4d7ce06 100644 --- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h +++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h @@ -38,6 +38,22 @@ struct PTO2OrchestratorState; // forward declare +// ============================================================================= +// TensorMap Lookup Profiling (must precede inline lookup/insert methods) +// ============================================================================= +#ifndef PTO2_TENSORMAP_PROFILING +#define PTO2_TENSORMAP_PROFILING 0 +#endif + +#if PTO2_TENSORMAP_PROFILING +extern uint64_t g_lookup_chain_total; +extern uint64_t g_lookup_count; +extern int32_t g_lookup_chain_max; +extern uint64_t g_lookup_overlap_checks; +extern uint64_t g_lookup_overlap_hits; +extern uint64_t g_insert_count; +#endif + // ============================================================================= // TensorMap Structure // ============================================================================= @@ -212,8 +228,15 @@ struct PTO2TensorMap { PTO2TensorMapEntry* cur_entry = *prev_ptr; result.count = 0; +#if PTO2_TENSORMAP_PROFILING + g_lookup_count++; + int32_t chain_len = 0; +#endif while (cur_entry != nullptr) { +#if PTO2_TENSORMAP_PROFILING + chain_len++; +#endif // Check validity first if (!entry_valid(*cur_entry)) { // 
========== STALE ENTRY: Truncate chain here ========== @@ -227,16 +250,22 @@ struct PTO2TensorMap { remove_entry(*cur_entry); cur_entry = next_entry; } - return; + break; } // Entry is valid - check if regions OVERLAP (not just exact match) // Since we hash only by base_ptr, all entries in this bucket have // potential to overlap. We must check actual byte-range overlap. if (query_tensor_data.buffer.addr == cur_entry->addr) { +#if PTO2_TENSORMAP_PROFILING + g_lookup_overlap_checks++; +#endif auto overlap_status = query_tensor_data.is_overlap(cur_entry->tensor.data()); if (overlap_status != OverlapStatus::NO_OVERLAP) { result.push(cur_entry, overlap_status); +#if PTO2_TENSORMAP_PROFILING + g_lookup_overlap_hits++; +#endif } } @@ -244,6 +273,10 @@ struct PTO2TensorMap { prev_ptr = &cur_entry->next_in_bucket; cur_entry = *prev_ptr; } +#if PTO2_TENSORMAP_PROFILING + g_lookup_chain_total += chain_len; + if (chain_len > g_lookup_chain_max) g_lookup_chain_max = chain_len; +#endif } /** @@ -256,6 +289,9 @@ struct PTO2TensorMap { * @param producer_task_id Task ID of producer */ void insert(const Tensor& tensor, int32_t producer_task_id, bool with_alloc) { +#if PTO2_TENSORMAP_PROFILING + g_insert_count++; +#endif // Allocate entry from ring buffer pool PTO2TensorMapEntry* entry = new_entry(); @@ -394,14 +430,7 @@ struct PTO2TensorMap { void sync_tensormap(); }; -// ============================================================================= -// TensorMap Lookup Profiling -// ============================================================================= -#ifndef PTO2_ORCH_PROFILING -#define PTO2_ORCH_PROFILING 1 -#endif - -#if PTO2_ORCH_PROFILING +#if PTO2_TENSORMAP_PROFILING struct PTO2TensorMapProfilingData { uint64_t lookup_chain_total; uint64_t lookup_count; diff --git a/tools/sched_overhead_analysis.py b/tools/sched_overhead_analysis.py index e6ea363df..51e9b35af 100644 --- a/tools/sched_overhead_analysis.py +++ b/tools/sched_overhead_analysis.py @@ -37,19 +37,11 @@ def 
parse_scheduler_threads(log_path): Expected log format (per thread): Thread N: completed=X tasks in Yus (Z loops, W tasks/loop) - Thread N: --- Phase Breakdown (execution order) --- - Thread N: scan: Xus (Y%) - Thread N: early_ready: Xus (Y%) (deps already met at submit time) - Thread N: complete: Xus (Y%) [fanout: edges=A, max_degree=B, avg=C] - Thread N: dispatch: Xus (Y%) [steal: own=A, steal=B, pct=C%] - Thread N: --- Lock Contention (ready_q) --- - Thread N: total: wait= Xus hold= Yus - Thread N: scan: wait= Xus hold= Yus - Thread N: early_ready: wait= Xus hold= Yus - Thread N: complete: wait= Xus hold= Yus - Thread N: dispatch: wait= Xus hold= Yus - Thread N: hit: wait= Xus hold= Yus (dequeued task) - Thread N: miss: wait= Xus hold= Yus (empty queue) + Thread N: --- Phase Breakdown --- + Thread N: complete: Xus (Y%) [fanout: edges=A, max_degree=B, avg=C] [fanin: edges=D, max_degree=E, avg=F] + Thread N: scan: Xus (Y%) + Thread N: dispatch: Xus (Y%) [pop: hit=A, miss=B, hit_rate=C%] + Thread N: idle: Xus (Y%) """ threads = {} with open(log_path, 'r', errors='ignore') as f: @@ -65,24 +57,12 @@ def parse_scheduler_threads(log_path): 'tasks_per_loop': float(m.group(5)), } - # Phase: scan (distinguished from lock scan by absence of "wait=") - m = re.search(r'Thread (\d+):\s+scan:\s+([\d.]+)us \(\s*([\d.]+)%\)', line) - if m: - tid = int(m.group(1)) - if tid in threads: - threads[tid]['scan_us'] = float(m.group(2)) - threads[tid]['scan_pct'] = float(m.group(3)) - - # Phase: early_ready - m = re.search(r'Thread (\d+):\s+early_ready:\s+([\d.]+)us \(\s*([\d.]+)%\)', line) - if m: - tid = int(m.group(1)) - if tid in threads: - threads[tid]['early_ready_us'] = float(m.group(2)) - threads[tid]['early_ready_pct'] = float(m.group(3)) - - # Phase: complete [fanout: edges=X, max_degree=Y, avg=Z] - m = re.search(r'Thread (\d+):\s+complete:\s+([\d.]+)us \(\s*([\d.]+)%\)\s+\[fanout: edges=(\d+), max_degree=(\d+), avg=([\d.]+)\]', line) + # Phase: complete [fanout: edges=X, 
max_degree=Y, avg=Z] [fanin: edges=D, max_degree=E, avg=F] + m = re.search( + r'Thread (\d+):\s+complete:\s+([\d.]+)us \(\s*([\d.]+)%\)' + r'\s+\[fanout: edges=(\d+), max_degree=(\d+), avg=([\d.]+)\]' + r'\s+\[fanin: edges=(\d+), max_degree=(\d+), avg=([\d.]+)\]', + line) if m: tid = int(m.group(1)) if tid in threads: @@ -91,73 +71,36 @@ def parse_scheduler_threads(log_path): threads[tid]['fanout_edges'] = int(m.group(4)) threads[tid]['fanout_max_degree'] = int(m.group(5)) threads[tid]['fanout_avg'] = float(m.group(6)) + threads[tid]['fanin_edges'] = int(m.group(7)) + threads[tid]['fanin_max_degree'] = int(m.group(8)) + threads[tid]['fanin_avg'] = float(m.group(9)) - # Phase: dispatch [steal: own=X, steal=Y, pct=Z%] - m = re.search(r'Thread (\d+):\s+dispatch:\s+([\d.]+)us \(\s*([\d.]+)%\)\s+\[steal: own=(\d+), steal=(\d+), pct=([\d.]+)%\]', line) - if m: - tid = int(m.group(1)) - if tid in threads: - threads[tid]['dispatch_us'] = float(m.group(2)) - threads[tid]['dispatch_pct'] = float(m.group(3)) - threads[tid]['steal_own'] = int(m.group(4)) - threads[tid]['steal_steal'] = int(m.group(5)) - threads[tid]['steal_pct'] = float(m.group(6)) - - # Lock: total - m = re.search(r'Thread (\d+):\s+total:\s+wait=\s*(\d+)us hold=\s*(\d+)us', line) - if m: - tid = int(m.group(1)) - if tid in threads: - threads[tid]['lock_wait_us'] = int(m.group(2)) - threads[tid]['lock_hold_us'] = int(m.group(3)) - - # Lock: scan - m = re.search(r'Thread (\d+):\s+scan:\s+wait=\s*(\d+)us hold=\s*(\d+)us', line) - if m: - tid = int(m.group(1)) - if tid in threads: - threads[tid]['lock_scan_wait'] = int(m.group(2)) - threads[tid]['lock_scan_hold'] = int(m.group(3)) - - # Lock: early_ready - m = re.search(r'Thread (\d+):\s+early_ready:\s+wait=\s*(\d+)us hold=\s*(\d+)us', line) - if m: - tid = int(m.group(1)) - if tid in threads: - threads[tid]['lock_early_ready_wait'] = int(m.group(2)) - threads[tid]['lock_early_ready_hold'] = int(m.group(3)) - - # Lock: complete - m = re.search(r'Thread 
(\d+):\s+complete:\s+wait=\s*(\d+)us hold=\s*(\d+)us', line) - if m: - tid = int(m.group(1)) - if tid in threads: - threads[tid]['lock_complete_wait'] = int(m.group(2)) - threads[tid]['lock_complete_hold'] = int(m.group(3)) - - # Lock: dispatch - m = re.search(r'Thread (\d+):\s+dispatch:\s+wait=\s*(\d+)us hold=\s*(\d+)us', line) + # Phase: scan + m = re.search(r'Thread (\d+):\s+scan:\s+([\d.]+)us \(\s*([\d.]+)%\)', line) if m: tid = int(m.group(1)) if tid in threads: - threads[tid]['lock_dispatch_wait'] = int(m.group(2)) - threads[tid]['lock_dispatch_hold'] = int(m.group(3)) + threads[tid]['scan_us'] = float(m.group(2)) + threads[tid]['scan_pct'] = float(m.group(3)) - # Lock: dispatch hit - m = re.search(r'Thread (\d+):\s+hit:\s+wait=\s*(\d+)us hold=\s*(\d+)us', line) + # Phase: dispatch [pop: hit=X, miss=Y, hit_rate=Z%] + m = re.search(r'Thread (\d+):\s+dispatch:\s+([\d.]+)us \(\s*([\d.]+)%\)\s+\[pop: hit=(\d+), miss=(\d+), hit_rate=([\d.]+)%\]', line) if m: tid = int(m.group(1)) if tid in threads: - threads[tid]['lock_dispatch_hit_wait'] = int(m.group(2)) - threads[tid]['lock_dispatch_hit_hold'] = int(m.group(3)) + threads[tid]['dispatch_us'] = float(m.group(2)) + threads[tid]['dispatch_pct'] = float(m.group(3)) + threads[tid]['pop_hit'] = int(m.group(4)) + threads[tid]['pop_miss'] = int(m.group(5)) + threads[tid]['pop_hit_rate'] = float(m.group(6)) - # Lock: dispatch miss - m = re.search(r'Thread (\d+):\s+miss:\s+wait=\s*(\d+)us hold=\s*(\d+)us', line) + # Phase: idle + m = re.search(r'Thread (\d+):\s+idle:\s+([\d.]+)us \(\s*([\d.]+)%\)', line) if m: tid = int(m.group(1)) if tid in threads: - threads[tid]['lock_dispatch_miss_wait'] = int(m.group(2)) - threads[tid]['lock_dispatch_miss_hold'] = int(m.group(3)) + threads[tid]['idle_us'] = float(m.group(2)) + threads[tid]['idle_pct'] = float(m.group(3)) return threads @@ -304,12 +247,12 @@ def run_analysis(perf_path, log_path, print_sources=True, selection_strategy=Non print() # Phase breakdown - phases = ['scan', 
'early_ready', 'complete', 'dispatch'] + phases = ['complete', 'scan', 'dispatch', 'idle'] phase_labels = { - 'scan': 'Scan (discover new root tasks)', - 'early_ready': 'Early ready (deps met at submit time)', - 'complete': 'Complete (poll handshake, resolve fanout)', + 'complete': 'Complete (poll handshake, resolve deps)', + 'scan': 'Scan (update perf header)', 'dispatch': 'Dispatch (pop queue, build payload, flush)', + 'idle': 'Idle (spinning, no progress)', } fmt3 = " {:<50} {:>11} {:>10} {:>14}" @@ -328,42 +271,22 @@ def run_analysis(perf_path, log_path, print_sources=True, selection_strategy=Non # Fanout stats (from complete phase) fanout_edges = sum(t.get('fanout_edges', 0) for t in threads.values()) fanout_max = max((t.get('fanout_max_degree', 0) for t in threads.values()), default=0) - fanout_avg_weighted = sum(t.get('fanout_avg', 0) * t.get('fanout_edges', 0) for t in threads.values()) - fanout_avg = fanout_avg_weighted / fanout_edges if fanout_edges > 0 else 0 - print(f' Fanout: total edges={fanout_edges}, max_degree={fanout_max}, avg_degree={fanout_avg:.1f}') - print() - - # Work stealing stats (from dispatch phase) - steal_own = sum(t.get('steal_own', 0) for t in threads.values()) - steal_steal = sum(t.get('steal_steal', 0) for t in threads.values()) - steal_total = steal_own + steal_steal - steal_pct = steal_steal / steal_total * 100 if steal_total > 0 else 0 - print(f' Work stealing: own={steal_own}, stolen={steal_steal} ({steal_pct:.1f}% steal rate)') - print() - - # Lock contention breakdown - fmt4 = " {:<50} {:>11} {:>10}" - print(fmt4.format('Lock contention (ready_q)', 'Total (us)', '% of total')) - print(' ' + '-' * 75) - lock_wait = sum(t.get('lock_wait_us', 0) for t in threads.values()) - lock_hold = sum(t.get('lock_hold_us', 0) for t in threads.values()) - print(fmt4.format(' wait (spinning for lock)', str(lock_wait), f'{lock_wait/total_us*100:.1f}%' if total_us > 0 else '0.0%')) - print(fmt4.format(' hold (inside critical section)', 
str(lock_hold), f'{lock_hold/total_us*100:.1f}%' if total_us > 0 else '0.0%')) + fanout_avg = fanout_edges / total_completed if total_completed > 0 else 0 + print(f' Fanout (notify consumers): total edges={fanout_edges}, max_degree={fanout_max}, avg_degree={fanout_avg:.1f}') + + # Fanin stats (from complete phase) + fanin_edges = sum(t.get('fanin_edges', 0) for t in threads.values()) + fanin_max = max((t.get('fanin_max_degree', 0) for t in threads.values()), default=0) + fanin_avg = fanin_edges / total_completed if total_completed > 0 else 0 + print(f' Fanin (release producers): total edges={fanin_edges}, max_degree={fanin_max}, avg_degree={fanin_avg:.1f}') print() - # Lock wait breakdown by phase - print(' Lock wait by phase:') - for p in phases: - w = sum(t.get(f'lock_{p}_wait', 0) for t in threads.values()) - h = sum(t.get(f'lock_{p}_hold', 0) for t in threads.values()) - print(f' {p:<14} wait={w:>6} us hold={h:>6} us') - # Dispatch hit/miss sub-breakdown - hit_w = sum(t.get('lock_dispatch_hit_wait', 0) for t in threads.values()) - hit_h = sum(t.get('lock_dispatch_hit_hold', 0) for t in threads.values()) - miss_w = sum(t.get('lock_dispatch_miss_wait', 0) for t in threads.values()) - miss_h = sum(t.get('lock_dispatch_miss_hold', 0) for t in threads.values()) - print(f' {"hit":<12} wait={hit_w:>6} us hold={hit_h:>6} us (dequeued task)') - print(f' {"miss":<12} wait={miss_w:>6} us hold={miss_h:>6} us (empty queue)') + # Pop stats (from dispatch phase) + pop_hit = sum(t.get('pop_hit', 0) for t in threads.values()) + pop_miss = sum(t.get('pop_miss', 0) for t in threads.values()) + pop_total = pop_hit + pop_miss + pop_hit_rate = pop_hit / pop_total * 100 if pop_total > 0 else 0 + print(f' Pop: hit={pop_hit}, miss={pop_miss}, hit_rate={pop_hit_rate:.1f}%') print() print('=' * 90) @@ -391,75 +314,29 @@ def run_analysis(perf_path, log_path, print_sources=True, selection_strategy=Non avg_tail_oh = sum(tails) / n loop_ratio = avg_tail_oh / avg_loop_us if avg_loop_us > 0 
else 0 print(f' Avg scheduler loop iteration: {avg_loop_us:.1f} us (approx avg polling interval per loop)') - if n_threads > 0: - print(f' With {n_threads} threads sharing {total_loops} loops over {total_us/n_threads:.0f} us wall each:') - print() - - print(' Scheduler CPU time breakdown (per completed task):') - - # Build phase data with sub-items for sorting - phase_details = { - 'dispatch': { - 'label': 'Dispatch phase (build payload + cache flush)', - 'total': phase_totals.get('dispatch', 0), - 'sub_items': [ - ('Lock wait (ready_q pop)', sum(t.get('lock_dispatch_wait', 0) for t in threads.values())), - ('Lock hold + build + dc cvac/civac + dsb sy', phase_totals.get('dispatch', 0) - sum(t.get('lock_dispatch_wait', 0) for t in threads.values())), - ] - }, - 'complete': { - 'label': 'Complete phase (poll + fanout resolve)', - 'total': phase_totals.get('complete', 0), - 'sub_items': [ - ('Lock wait (ready_q push)', sum(t.get('lock_complete_wait', 0) for t in threads.values())), - ('Fanout traversal + atomic ops', phase_totals.get('complete', 0) - sum(t.get('lock_complete_wait', 0) for t in threads.values())), - ] - }, - 'scan': { - 'label': 'Scan phase (new task discovery)', - 'total': phase_totals.get('scan', 0), - 'sub_items': [] - }, - 'early_ready': { - 'label': 'Early ready (deps met at submit time)', - 'total': phase_totals.get('early_ready', 0), - 'sub_items': [] - }, - } - - # Sort by total descending - for _, detail in sorted(phase_details.items(), key=lambda x: x[1]['total'], reverse=True): - per_task = detail['total'] / total_completed if total_completed > 0 else 0 - pct = detail['total'] / total_us * 100 if total_us > 0 else 0 - print(f' - {detail["label"]:<50} {per_task:.2f} us/task ({pct:.1f}% of scheduler CPU)') - for sub_label, sub_total in detail['sub_items']: - sub_per_task = sub_total / total_completed if total_completed > 0 else 0 - print(f' {sub_label:<48} {sub_per_task:.2f} us/task') - print() print(f' Avg Tail OH = {avg_tail_oh:.1f} us ~= 
{loop_ratio:.1f} x avg loop iteration ({avg_loop_us:.1f} us)') print(f' -> On average, a completed task waits ~{loop_ratio:.1f} loop iterations before being detected') print() - # Data-driven insight: find the dominant phase (excluding early_ready which is typically trivial) + # Data-driven insight: find the dominant phase (excluding idle which is not useful work) work_phases = {p: phase_totals.get(p, 0) for p in ['scan', 'complete', 'dispatch']} dominant_phase = max(work_phases, key=work_phases.get) dominant_pct = work_phases[dominant_phase] / total_us * 100 if total_us > 0 else 0 print(f' Key insight: {phase_labels[dominant_phase].split(" (")[0]} phase consumes ~{dominant_pct:.0f}% of scheduler CPU.') if dominant_phase == 'dispatch': - dispatch_total = phase_totals.get('dispatch', 0) - dispatch_lock_pct = sum(t.get('lock_dispatch_wait', 0) for t in threads.values()) / dispatch_total * 100 if dispatch_total > 0 else 0 - print(f' Within dispatch, lock contention accounts for {dispatch_lock_pct:.0f}% of time.') - if miss_w > hit_w: - print(f' Dispatch miss (empty queue) dominates lock wait: miss={miss_w}us vs hit={hit_w}us.') - print(' Cache flush (dc cvac + dsb sy) is the dominant non-lock cost.') + print(f' Pop hit_rate={pop_hit_rate:.1f}%: {"low hit rate suggests ready queue often empty" if pop_hit_rate < 50 else "good hit rate"}.') + print(' Cache flush (dc cvac + dsb sy) is the dominant non-pop cost.') elif dominant_phase == 'complete': - complete_total = phase_totals.get('complete', 0) - complete_lock_pct = sum(t.get('lock_complete_wait', 0) for t in threads.values()) / complete_total * 100 if complete_total > 0 else 0 - print(f' Within complete, lock contention accounts for {complete_lock_pct:.0f}% of time.') - print(' Fanout traversal and atomic ops dominate the non-lock cost.') + total_edges = fanout_edges + fanin_edges + print(f' Fanout: avg_degree={fanout_avg:.1f}, max_degree={fanout_max}.') + print(f' Fanin: avg_degree={fanin_avg:.1f}, 
max_degree={fanin_max}.') + if fanin_edges > fanout_edges: + print(' Fanin traversal (release_producer + check_consumed) dominates the complete phase.') + else: + print(' Fanout traversal and atomic ops dominate the complete phase.') elif dominant_phase == 'scan': - print(' Scan phase overhead indicates too many root tasks or inefficient task graph traversal.') + print(' Scan phase overhead indicates frequent perf header updates.') print('=' * 90) return 0 diff --git a/tools/swimlane_converter.py b/tools/swimlane_converter.py index db1ec714c..4de82b6e9 100644 --- a/tools/swimlane_converter.py +++ b/tools/swimlane_converter.py @@ -137,10 +137,20 @@ def parse_sched_cpu_from_device_log(log_path, task_count): if total_sched_cpu_us <= 0: return None - return total_sched_cpu_us / task_count + total_completed = sum(t.get('completed', 0) for t in threads.values()) + if task_count > 0 and total_completed > 0 and abs(total_completed - task_count) / task_count > 0.5: + print(f"Warning: device log has {total_completed} completed tasks " + f"but perf JSON has {task_count}; skipping Sched CPU metric " + f"(device log may be from a different run)", file=sys.stderr) + return None + + return { + 'us_per_task': total_sched_cpu_us / task_count, + 'num_sched_threads': len(threads), + } -def print_task_statistics(tasks, func_id_to_name=None, sched_cpu_us_per_task=None): +def print_task_statistics(tasks, func_id_to_name=None, sched_info=None): """Print task statistics grouped by func_id. Exec = kernel execution time (end_time_us - start_time_us) on AICore. 
@@ -152,8 +162,8 @@ def print_task_statistics(tasks, func_id_to_name=None, sched_cpu_us_per_task=Non Args: tasks: List of task dicts func_id_to_name: Optional dict mapping func_id to function name - sched_cpu_us_per_task: Optional AICPU scheduler CPU time per task (us), - parsed from device log + sched_info: Optional dict with 'us_per_task' (float) and 'num_sched_threads' (int), + parsed from device log by parse_sched_cpu_from_device_log() """ from collections import defaultdict @@ -204,13 +214,12 @@ def print_task_statistics(tasks, func_id_to_name=None, sched_cpu_us_per_task=Non max_finish_time = max(max_finish_time, finish_time) # Print statistics - print("\n" + "=" * 160) + print("\n" + "=" * 110) print("Task Statistics by Function") print(" Exec = kernel time on AICore; Latency = dispatch->finish (incl. head OH + Exec + tail OH)") - print("=" * 160) - print(f"{'Func_ID':<8} {'Func_Name':<12} {'Count':^6} {'Total_Exec/Latency(us)':^25} {'Avg_Exec/Latency(us)':^23} " - f"{'Min_Exec/Latency(us)':^23} {'Max_Exec/Latency(us)':^23} {'Avg_Head/Tail_OH(us)':^23} {'Exec_%':^8}") - print("-" * 160) + print("=" * 110) + print(f"{'Func_ID':<8} {'Func_Name':<12} {'Count':>5} {'Avg Exec(us)':>12} {'Avg Latency(us)':>15} {'Exec%':>6} {'Avg Head OH(us)':>15} {'Avg Tail OH(us)':>15}") + print("-" * 110) # Sort by func_id for consistent output total_count = 0 @@ -222,8 +231,6 @@ def print_task_statistics(tasks, func_id_to_name=None, sched_cpu_us_per_task=Non count = len(durations) sum_duration = sum(durations) avg_duration = sum_duration / count - min_duration = min(durations) - max_duration = max(durations) # Accumulate totals total_count += count @@ -235,34 +242,22 @@ def print_task_statistics(tasks, func_id_to_name=None, sched_cpu_us_per_task=Non else: func_name = f"Func_{func_id}" - # Calculate averages for new metrics + # Calculate averages avg_head_overhead = sum(stats['head_overheads']) / len(stats['head_overheads']) if stats['head_overheads'] else 0 avg_tail_overhead = 
sum(stats['tail_overheads']) / len(stats['tail_overheads']) if stats['tail_overheads'] else 0 avg_latency = stats['total_latency'] / count if count > 0 else 0 - min_latency = min(stats['latencies']) if stats['latencies'] else 0 - max_latency = max(stats['latencies']) if stats['latencies'] else 0 - total_latency = stats['total_latency'] # Calculate execution ratio: total_exec_time / total_latency exec_ratio = (stats['total_exec_time'] / stats['total_latency'] * 100) if stats['total_latency'] > 0 else 0 - # Format combined exec/latency values - total_combined = f"{sum_duration:.2f}/{total_latency:.2f}" - avg_combined = f"{avg_duration:.2f}/{avg_latency:.2f}" - min_combined = f"{min_duration:.2f}/{min_latency:.2f}" - max_combined = f"{max_duration:.2f}/{max_latency:.2f}" - overhead_combined = f"{avg_head_overhead:.2f}/{avg_tail_overhead:.2f}" - - print(f"{func_id:<8} {func_name:<12} {count:^6} {total_combined:^25} {avg_combined:^23} " - f"{min_combined:^23} {max_combined:^23} {overhead_combined:^23} {exec_ratio:^7.2f}%") + print(f"{func_id:<8} {func_name:<12} {count:>5} {avg_duration:>12.2f} {avg_latency:>15.2f} {exec_ratio:>5.1f}% {avg_head_overhead:>15.2f} {avg_tail_overhead:>15.2f}") # Print total row - print("-" * 160) + print("-" * 110) # Calculate total latency (sum of all latencies) total_latency_sum = sum(stats['total_latency'] for stats in func_stats.values()) - total_combined = f"{total_duration:.2f}/{total_latency_sum:.2f}" - print(f"{'TOTAL':<21} {total_count:^6} {total_combined:^25}") + print(f"{'TOTAL':<21} {total_count:>5} {total_duration:>12.2f} {total_latency_sum:>15.2f}") # Print total test execution time if min_dispatch_time != float('inf') and max_finish_time != float('-inf'): @@ -276,12 +271,19 @@ def print_task_statistics(tasks, func_id_to_name=None, sched_cpu_us_per_task=Non exec_latency_ratio_pct = total_duration / total_latency_sum * 100 print("\n--- Task execution vs Scheduler overhead ---") print(f" Per-task (all): Avg Exec = 
{avg_exec_us:.2f} us, Avg Latency (dispatch->finish) = {avg_latency_us:.2f} us, Exec/Latency = {exec_latency_ratio_pct:.2f}%") - if sched_cpu_us_per_task is not None: - exec_sched_cpu_ratio = (avg_exec_us / sched_cpu_us_per_task * 100) if sched_cpu_us_per_task > 0 else 0 - print(f" Sched CPU (from device log): {sched_cpu_us_per_task:.2f} us/task (Exec/Sched_CPU = {exec_sched_cpu_ratio:.2f}%)") - print(" (Latency = dispatch to finish; Sched CPU = AICPU scheduler thread CPU time per task, from device log.)") + if sched_info is not None: + sched_cpu = sched_info['us_per_task'] + num_cores = len(set(t['core_id'] for t in tasks)) + exec_sched_ratio = (avg_exec_us / sched_cpu * 100) if sched_cpu > 0 else 0 + per_core_exec = avg_exec_us / num_cores if num_cores > 0 else 0 + per_core_ratio = (per_core_exec / sched_cpu * 100) if sched_cpu > 0 else 0 + num_threads = sched_info['num_sched_threads'] + print(f" Sched CPU (from device log): {sched_cpu:.2f} us/task (Exec/Sched = {exec_sched_ratio:.1f}%, PerCore/Sched = {per_core_ratio:.1f}%)") + print(f" (Latency = dispatch→finish; Sched CPU = scheduler thread CPU per task; PerCore = avg_exec/{num_cores}_cores vs sched_cpu, {num_threads} sched threads)") + else: + print(" (Latency = dispatch→finish; Sched CPU = scheduler thread CPU per task)") - print("=" * 160) + print("=" * 110) @@ -564,7 +566,7 @@ def generate_chrome_trace_json(tasks, output_path, func_id_to_name=None, verbose "complete": "good", # green "dispatch": "terrible", # red "scan": "thread_state_running", # blue - "early_ready": "yellow", # yellow + "idle": "yellow", # yellow } for thread_idx, thread_records in enumerate(scheduler_phases): @@ -972,14 +974,14 @@ def main(): print(f"Selection: {log_strategy}") # Optional: parse sched CPU from device log - sched_cpu_us = None + sched_info = None if resolved_device_log is not None: - sched_cpu_us = parse_sched_cpu_from_device_log(resolved_device_log, len(data['tasks'])) - if args.verbose and sched_cpu_us is not None: - 
print(f" Parsed sched CPU from device log: {sched_cpu_us:.2f} us/task") + sched_info = parse_sched_cpu_from_device_log(resolved_device_log, len(data['tasks'])) + if args.verbose and sched_info is not None: + print(f" Parsed sched CPU from device log: {sched_info['us_per_task']:.2f} us/task") # Print task statistics (incl. task execution vs scheduler overhead) - print_task_statistics(data['tasks'], func_names, sched_cpu_us_per_task=sched_cpu_us) + print_task_statistics(data['tasks'], func_names, sched_info=sched_info) # Integrated deep-dive overhead analysis if resolved_device_log is not None: