Skip to content

Commit 54c8d44

Browse files
author
echo_stone
committed
Enhance async_notify_demo with NotifyWait kernel and orchestration updates
- Updated the async_notify_demo to include a new NotifyWait kernel that registers a notification counter condition for inter-rank synchronization.
- Modified the consumer kernel to depend on the completion of NotifyWait, ensuring it only executes when the notification counter is satisfied.
- Enhanced orchestration logic to incorporate the NotifyWait phase, allowing for more robust task dependency management.
- Refactored kernel argument layouts to accommodate the new dependency token from NotifyWait.
- Improved runtime handling by removing legacy notification wait mechanisms, streamlining the completion process.
1 parent f6bb0b2 commit 54c8d44

16 files changed

Lines changed: 208 additions & 227 deletions

File tree

examples/a2a3/tensormap_and_ringbuffer/async_notify_demo/kernels/aiv/kernel_consumer.cpp

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@
33
*
44
* Implements: result[i] = src[i] + notify_counter[0]
55
*
6-
* This kernel is launch-gated: the scheduler only promotes it to READY after
7-
* both its fanin (producer complete) AND local notification counter >= 1.
6+
* Depends on NotifyWait completing (via dummy tensor), guaranteeing
7+
* the local notification counter >= 1 before this kernel runs.
88
*
99
* Kernel args layout (packed by scheduler):
10-
* args[0] = &Tensor(src) — input tensor struct pointer (producer's output)
11-
* args[1] = &Tensor(result) — output tensor struct pointer
12-
* args[2] = notify_counter_addr — local notify counter (window memory)
10+
* args[0] = &Tensor(dummy_notify) — input (dependency token from NotifyWait)
11+
* args[1] = &Tensor(src) — input tensor struct pointer (producer's output)
12+
* args[2] = &Tensor(result) — output tensor struct pointer
13+
* args[3] = notify_counter_addr — local notify counter (window memory)
1314
*/
1415

1516
#include <cstdint>
@@ -28,9 +29,10 @@ using namespace pto;
2829
#endif
2930

3031
extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t* args) {
31-
__gm__ Tensor* src_tensor = reinterpret_cast<__gm__ Tensor*>(args[0]);
32-
__gm__ Tensor* result_tensor = reinterpret_cast<__gm__ Tensor*>(args[1]);
33-
__gm__ int32_t* notify_counter = reinterpret_cast<__gm__ int32_t*>(args[2]);
32+
// args[0] = dummy_notify tensor (dependency token, unused)
33+
__gm__ Tensor* src_tensor = reinterpret_cast<__gm__ Tensor*>(args[1]);
34+
__gm__ Tensor* result_tensor = reinterpret_cast<__gm__ Tensor*>(args[2]);
35+
__gm__ int32_t* notify_counter = reinterpret_cast<__gm__ int32_t*>(args[3]);
3436

3537
__gm__ float* src =
3638
reinterpret_cast<__gm__ float*>(src_tensor->buffer.addr) + src_tensor->start_offset;
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/**
2+
* NotifyWait Kernel — register notification counter as CQ condition (func_id=2)
3+
*
4+
* Trivial deferred-completion kernel: registers a COUNTER wait condition
5+
* for the notification counter, then returns immediately. The scheduler
6+
* polls the counter via the CQ mechanism and completes this task once
7+
* *notify_counter >= expected_value.
8+
*
9+
* Kernel args layout:
10+
* args[0] = &Tensor(dummy_notify) — output (dependency token for downstream)
11+
* args[1] = notify_counter_addr — scalar (GM int32* to poll)
12+
* args[2] = expected_value — scalar (threshold)
13+
* args[3] = cq_addr — scalar (auto-appended by deferred submit)
14+
*/
15+
16+
#include <cstdint>
17+
18+
#ifndef __gm__
19+
#define __gm__
20+
#endif
21+
22+
#ifndef __aicore__
23+
#define __aicore__ [aicore]
24+
#endif
25+
26+
#include <pto/pto-inst.hpp>
27+
#include "tensor.h"
28+
#include "pto_cq_kernel_api.h"
29+
30+
extern "C" __aicore__ __attribute__((always_inline))
31+
void kernel_entry(__gm__ int64_t* args) {
32+
uint64_t notify_counter_addr = static_cast<uint64_t>(args[1]);
33+
uint32_t expected_value = static_cast<uint32_t>(args[2]);
34+
uint64_t cq_addr = static_cast<uint64_t>(args[3]);
35+
36+
volatile __gm__ PTO2CompletionQueue* cq = pto2_cq_get(cq_addr);
37+
pto2_cq_reset(cq);
38+
pto2_save_expected_completion(PTO2_ENGINE_SDMA, cq,
39+
notify_counter_addr, expected_value);
40+
// Flush CQ writes from AICore data cache to GM so the AICPU scheduler
41+
// can read them. pto2_cq_flush's #if-defined guards don't fire because
42+
// the constants are C++ enums, not macros — call intrinsics directly.
43+
dcci((__gm__ int32_t*)cq, cache_line_t::ENTIRE_DATA_CACHE, dcci_dst_t::CACHELINE_OUT);
44+
dsb(DSB_DDR);
45+
pipe_barrier(PIPE_ALL);
46+
}

examples/a2a3/tensormap_and_ringbuffer/async_notify_demo/kernels/kernel_config.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
Async Notify Demo - Kernel and Orchestration Configuration
33
44
Two hardware cards use TNOTIFY(AtomicAdd) for inter-rank notification.
5-
The consumer is launch-gated on the local notification counter >= 1.
5+
The consumer depends on a deferred NotifyWait task that polls the
6+
local notification counter >= 1 via the CQ mechanism.
67
"""
78

89
import os
@@ -22,6 +23,7 @@
2223
KERNELS = [
2324
{"func_id": 0, "source": str(_KERNELS_ROOT / "aiv" / "kernel_producer_notify.cpp"), "core_type": "aiv"},
2425
{"func_id": 1, "source": str(_KERNELS_ROOT / "aiv" / "kernel_consumer.cpp"), "core_type": "aiv"},
26+
{"func_id": 2, "source": str(_KERNELS_ROOT / "aiv" / "kernel_notify_wait.cpp"), "core_type": "aiv"},
2527
]
2628

2729
RUNTIME_CONFIG = {

examples/a2a3/tensormap_and_ringbuffer/async_notify_demo/kernels/orchestration/async_notify_orchestration.cpp

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,14 @@
22
* Async Notify Demo - Device-side orchestration
33
*
44
* Two-card hardware mode:
5-
* t0 (producer): out = in * 2, then TNOTIFY(AtomicAdd) the peer's window
6-
* counter. Completes normally (no deferred completion).
7-
* t1 (consumer, launch-gated): result = out + notify_counter.
8-
* Gated by local notification counter >= 1.
9-
* The scheduler only promotes this task to READY after both
10-
* its fanin is satisfied AND the local counter reaches 1.
5+
* t0 (producer, func_id=0): out = in * 2, then TNOTIFY(AtomicAdd) the
6+
* peer's window counter. Completes normally (RTC).
7+
* t1 (notify_wait, func_id=2, deferred): registers notification counter
8+
* condition (counter >= 1) via CQ, returns immediately.
9+
* Produces dummy_notify tensor for dependency chain.
10+
* t2 (consumer, func_id=1): result = out + notify_counter.
11+
* Depends on both producer (via ext_out) and notify_wait
12+
* (via dummy_notify), ensuring counter >= 1 before reading.
1113
*
1214
* The notify counter is pre-zeroed by the distributed runner input loader.
1315
*/
@@ -50,6 +52,12 @@ void aicpu_orchestration_entry(uint64_t* args, int arg_count,
5052
Tensor ext_out = make_tensor_external(out_ptr, shapes, 1, DataType::FLOAT32);
5153
Tensor ext_result = make_tensor_external(result_ptr, shapes, 1, DataType::FLOAT32);
5254

55+
uint64_t cq_notify = pto2_rt_alloc_cq();
56+
if (cq_notify == 0) {
57+
LOG_ERROR("async_notify_demo: rank %d failed CQ alloc", my_rank);
58+
return;
59+
}
60+
5361
// Producer: normal run-to-completion task (sends TNOTIFY to peer)
5462
PTOParam params_producer;
5563
params_producer.add_input(ext_in);
@@ -58,18 +66,27 @@ void aicpu_orchestration_entry(uint64_t* args, int arg_count,
5866
params_producer.add_scalar((uint64_t)(uintptr_t)comm_ctx);
5967
pto2_rt_submit_aiv_task(0, params_producer);
6068

61-
// Consumer: launch-gated by local notification counter.
62-
// After fanin (producer complete) is satisfied, the scheduler still holds
63-
// this task in PTO2NotificationWaitList until *notify_counter >= 1.
69+
// NotifyWait: deferred task that waits for notification counter >= 1.
70+
// Produces dummy_notify so the consumer can depend on it via TensorMap.
71+
uint32_t dummy_shape[1] = { 1 };
72+
Tensor dummy_notify = make_tensor(dummy_shape, 1, DataType::INT32);
73+
74+
PTOParam params_wait;
75+
params_wait.add_output(dummy_notify);
76+
params_wait.add_scalar((uint64_t)(uintptr_t)notify_counter_ptr);
77+
params_wait.add_scalar((uint64_t)1);
78+
pto2_rt_submit_aiv_task_deferred(2, params_wait, cq_notify);
79+
80+
// Consumer: depends on producer (via ext_out) and notify_wait (via dummy_notify).
81+
// Guaranteed notify_counter >= 1 when this task runs.
6482
PTOParam params_consumer;
83+
params_consumer.add_input(dummy_notify);
6584
params_consumer.add_input(ext_out);
6685
params_consumer.add_output(ext_result);
6786
params_consumer.add_scalar((uint64_t)(uintptr_t)notify_counter_ptr);
68-
pto2_rt_expect_notification_counter(params_consumer,
69-
(uint64_t)(uintptr_t)notify_counter_ptr, 1);
7087
pto2_rt_submit_aiv_task(1, params_consumer);
7188

72-
LOG_INFO("async_notify_demo: rank %d producer=normal, consumer gated on counter=0x%lx",
89+
LOG_INFO("async_notify_demo: rank %d producer=RTC, notify_wait=deferred(counter=0x%lx), consumer=RTC",
7390
my_rank, (uint64_t)(uintptr_t)notify_counter_ptr);
7491
}
7592

examples/a2a3/tensormap_and_ringbuffer/moe_dispatch/kernels/aiv/kernel_moe_recv_assemble.cpp

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
/**
2-
* MOE RecvAssemble Kernel — cumsum + assemble expandX (func_id=3)
2+
* MOE RecvAssemble Kernel — cumsum + assemble expandX (func_id=2)
33
*
4-
* Launch-gated on notification counter >= NUM_RANKS-1 (7 peers).
4+
* Depends on NotifyWait completing (via dummy tensor input),
5+
* guaranteeing notify_counter >= NUM_RANKS-1 (7 peers done).
56
*
67
* Reads local_counts + per-source-rank recv_counts, computes cumulative
78
* sums for assembly offsets, copies token data from shmem_data slots
@@ -13,12 +14,13 @@
1314
* = recv_counts[src_rank * COUNT_PAD + expert_offset] otherwise
1415
*
1516
* Kernel args layout:
16-
* args[0] = &Tensor(local_counts) — input [COUNT_PAD] int32
17-
* args[1] = &Tensor(expand_x) — output [EXPAND_X_ROWS * HIDDEN_DIM] float
18-
* args[2] = &Tensor(expert_token_nums) — output [EXPERTS_PER_RANK] int32
19-
* args[3] = shmem_data_addr — scalar (GM float* base)
20-
* args[4] = recv_counts_addr — scalar (GM int32*, [NUM_RANKS * COUNT_PAD])
21-
* args[5] = CommDeviceContext* — scalar
17+
* args[0] = &Tensor(dummy_notify) — input (dependency token from NotifyWait)
18+
* args[1] = &Tensor(local_counts) — input [COUNT_PAD] int32
19+
* args[2] = &Tensor(expand_x) — output [EXPAND_X_ROWS * HIDDEN_DIM] float
20+
* args[3] = &Tensor(expert_token_nums) — output [EXPERTS_PER_RANK] int32
21+
* args[4] = shmem_data_addr — scalar (GM float* base)
22+
* args[5] = recv_counts_addr — scalar (GM int32*, [NUM_RANKS * COUNT_PAD])
23+
* args[6] = CommDeviceContext* — scalar
2224
*/
2325

2426
#include <cstdint>
@@ -43,16 +45,17 @@ static constexpr int COUNT_PAD = 32;
4345

4446
extern "C" __aicore__ __attribute__((always_inline))
4547
void kernel_entry(__gm__ int64_t* args) {
46-
__gm__ Tensor* local_cnt_t = reinterpret_cast<__gm__ Tensor*>(args[0]);
47-
__gm__ Tensor* expand_x_t = reinterpret_cast<__gm__ Tensor*>(args[1]);
48-
__gm__ Tensor* etn_t = reinterpret_cast<__gm__ Tensor*>(args[2]);
48+
// args[0] = dummy_notify tensor (dependency token, unused)
49+
__gm__ Tensor* local_cnt_t = reinterpret_cast<__gm__ Tensor*>(args[1]);
50+
__gm__ Tensor* expand_x_t = reinterpret_cast<__gm__ Tensor*>(args[2]);
51+
__gm__ Tensor* etn_t = reinterpret_cast<__gm__ Tensor*>(args[3]);
4952

5053
__gm__ float* shmem_data =
51-
reinterpret_cast<__gm__ float*>(static_cast<uintptr_t>(args[3]));
54+
reinterpret_cast<__gm__ float*>(static_cast<uintptr_t>(args[4]));
5255
__gm__ int32_t* recv_counts =
53-
reinterpret_cast<__gm__ int32_t*>(static_cast<uintptr_t>(args[4]));
56+
reinterpret_cast<__gm__ int32_t*>(static_cast<uintptr_t>(args[5]));
5457
__gm__ CommDeviceContext* comm_ctx =
55-
reinterpret_cast<__gm__ CommDeviceContext*>(static_cast<uintptr_t>(args[5]));
58+
reinterpret_cast<__gm__ CommDeviceContext*>(static_cast<uintptr_t>(args[6]));
5659

5760
__gm__ int32_t* local_counts =
5861
reinterpret_cast<__gm__ int32_t*>(local_cnt_t->buffer.addr) + local_cnt_t->start_offset;
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
/**
2+
* NotifyWait Kernel — register notification counter as CQ condition (func_id=3)
3+
*
4+
* Trivial deferred-completion kernel: registers a COUNTER wait condition
5+
* for the notification counter, then returns immediately. The scheduler
6+
* polls the counter via the CQ mechanism and completes this task once
7+
* *notify_counter >= expected_value.
8+
*
9+
* Kernel args layout:
10+
* args[0] = &Tensor(dummy_notify) — output (dependency token for downstream)
11+
* args[1] = notify_counter_addr — scalar (GM int32* to poll)
12+
* args[2] = expected_value — scalar (threshold)
13+
* args[3] = cq_addr — scalar (auto-appended by deferred submit)
14+
*/
15+
16+
#include <cstdint>
17+
18+
#ifndef __gm__
19+
#define __gm__
20+
#endif
21+
22+
#ifndef __aicore__
23+
#define __aicore__ [aicore]
24+
#endif
25+
26+
#include <pto/pto-inst.hpp>
27+
#include "tensor.h"
28+
#include "pto_cq_kernel_api.h"
29+
#include "pto_notify_kernel_api.h"
30+
31+
extern "C" __aicore__ __attribute__((always_inline))
32+
void kernel_entry(__gm__ int64_t* args) {
33+
// args[0] = dummy_notify tensor (output, unused by kernel)
34+
uint64_t notify_counter_addr = static_cast<uint64_t>(args[1]);
35+
uint32_t expected_value = static_cast<uint32_t>(args[2]);
36+
uint64_t cq_addr = static_cast<uint64_t>(args[3]);
37+
38+
volatile __gm__ PTO2CompletionQueue* cq = pto2_cq_get(cq_addr);
39+
pto2_cq_reset(cq);
40+
pto2_save_expected_notification_counter(
41+
cq,
42+
reinterpret_cast<volatile __gm__ int32_t*>(static_cast<uintptr_t>(notify_counter_addr)),
43+
expected_value);
44+
dcci((__gm__ int32_t*)cq, cache_line_t::ENTIRE_DATA_CACHE, dcci_dst_t::CACHELINE_OUT);
45+
dsb(DSB_DDR);
46+
pipe_barrier(PIPE_ALL);
47+
}

examples/a2a3/tensormap_and_ringbuffer/moe_dispatch/kernels/kernel_config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
{"func_id": 0, "source": str(_KERNELS_ROOT / "aiv" / "kernel_moe_prepare.cpp"), "core_type": "aiv"},
4747
{"func_id": 1, "source": str(_KERNELS_ROOT / "aiv" / "kernel_moe_send_data.cpp"), "core_type": "aiv"},
4848
{"func_id": 2, "source": str(_KERNELS_ROOT / "aiv" / "kernel_moe_recv_assemble.cpp"), "core_type": "aiv"},
49+
{"func_id": 3, "source": str(_KERNELS_ROOT / "aiv" / "kernel_notify_wait.cpp"), "core_type": "aiv"},
4950
]
5051

5152
RUNTIME_CONFIG = {

examples/a2a3/tensormap_and_ringbuffer/moe_dispatch/kernels/orchestration/moe_dispatch_orchestration.cpp

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/**
2-
* MOE Dispatch V2 Orchestration — 8-rank, 3-phase task DAG
2+
* MOE Dispatch V2 Orchestration — 8-rank, 4-phase task DAG
33
*
44
* Task DAG per rank:
55
*
@@ -13,10 +13,16 @@
1313
* | 7 × TPUT_ASYNC counts → peer recv_counts
1414
* | 7 × TNOTIFY → peer notify_counter
1515
* |
16-
* +-- local_counts --> Phase 2: RecvAssemble (func_id=2, launch-gated)
17-
* IN: local_counts
18-
* OUT: expand_x, expert_token_nums
19-
* Reads shmem_data + recv_counts after 7 notifications
16+
* +-- local_counts --+
17+
* |
18+
* Phase 1.5: NotifyWait (func_id=3, deferred CQ)
19+
* OUT: dummy_notify (dependency token)
20+
* Waits for notify_counter >= NUM_RANKS-1 via CQ poll
21+
* |
22+
* Phase 2: RecvAssemble (func_id=2, RTC)
23+
* IN: local_counts, dummy_notify
24+
* OUT: expand_x, expert_token_nums
25+
* Reads shmem_data + recv_counts after NotifyWait completes
2026
*
2127
* args layout (from DISTRIBUTED_CONFIG):
2228
* [0] = tokens (window, float*)
@@ -97,8 +103,9 @@ void aicpu_orchestration_entry(uint64_t* args, int arg_count,
97103
Tensor ext_etn = make_tensor_external(etn_ptr, etn_shape, 1, DataType::INT32);
98104

99105
uint64_t sdma_context = pto2_rt_get_sdma_context();
100-
uint64_t cq = pto2_rt_alloc_cq();
101-
if (sdma_context == 0 || cq == 0) {
106+
uint64_t cq_send = pto2_rt_alloc_cq();
107+
uint64_t cq_notify = pto2_rt_alloc_cq();
108+
if (sdma_context == 0 || cq_send == 0 || cq_notify == 0) {
102109
LOG_ERROR("moe_dispatch_v2: rank %d failed SDMA context or CQ alloc", my_rank);
103110
return;
104111
}
@@ -123,20 +130,31 @@ void aicpu_orchestration_entry(uint64_t* args, int arg_count,
123130
params_send.add_scalar(notify_counter_addr);
124131
params_send.add_scalar((uint64_t)(uintptr_t)comm_ctx);
125132
params_send.add_scalar(sdma_context);
126-
pto2_rt_submit_aiv_task_deferred(1, params_send, cq);
133+
pto2_rt_submit_aiv_task_deferred(1, params_send, cq_send);
127134

128-
// Phase 2: RecvAssemble (launch-gated on 7 notifications)
135+
// Phase 1.5: NotifyWait — deferred task that waits for notification counter.
136+
// Produces a dummy_notify tensor so RecvAssemble can depend on it via TensorMap.
137+
uint32_t dummy_shape[1] = { 1 };
138+
Tensor dummy_notify = make_tensor(dummy_shape, 1, DataType::INT32);
139+
140+
PTOParam params_wait;
141+
params_wait.add_output(dummy_notify);
142+
params_wait.add_scalar(notify_counter_addr);
143+
params_wait.add_scalar((uint64_t)(NUM_RANKS - 1));
144+
pto2_rt_submit_aiv_task_deferred(3, params_wait, cq_notify);
145+
146+
// Phase 2: RecvAssemble (depends on NotifyWait via dummy_notify)
129147
PTOParam params_recv;
148+
params_recv.add_input(dummy_notify);
130149
params_recv.add_input(ext_local_counts);
131150
params_recv.add_output(ext_expand_x);
132151
params_recv.add_output(ext_etn);
133152
params_recv.add_scalar(shmem_data_addr);
134153
params_recv.add_scalar(recv_counts_addr);
135154
params_recv.add_scalar((uint64_t)(uintptr_t)comm_ctx);
136-
pto2_rt_expect_notification_counter(params_recv, notify_counter_addr, NUM_RANKS - 1);
137155
pto2_rt_submit_aiv_task(2, params_recv);
138156

139-
LOG_INFO("moe_dispatch_v2: rank %d submitted 3-phase DAG (8-rank, expect %d notifs)",
157+
LOG_INFO("moe_dispatch_v2: rank %d submitted 4-phase DAG (8-rank, expect %d notifs)",
140158
my_rank, NUM_RANKS - 1);
141159
}
142160

src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1116,8 +1116,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa
11161116
uint64_t _t0_phase = _t0;
11171117
#endif
11181118
int32_t task_count = 0;
1119-
if (!tracker.has_any_running_cores() && async_wait_list.count == 0
1120-
&& rt->scheduler.notification_wait_list.get_count() == 0) {
1119+
if (!tracker.has_any_running_cores() && async_wait_list.count == 0) {
11211120
bool orch_done = orchestrator_done_;
11221121
if (orch_done) {
11231122
// Check for orchestrator fatal error — exit immediately
@@ -1204,18 +1203,6 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa
12041203
}
12051204
}
12061205

1207-
// Phase 0b: Poll notification counter conditions (pre-launch gating)
1208-
// Only one thread polls at a time to avoid double-enqueue races.
1209-
if (rt->scheduler.notification_wait_list.get_count() > 0 &&
1210-
rt->scheduler.notification_wait_list.try_lock_poll()) {
1211-
int32_t enqueued = rt->scheduler.notification_wait_list.poll_and_enqueue(
1212-
&rt->scheduler, local_bufs);
1213-
rt->scheduler.notification_wait_list.unlock_poll();
1214-
if (enqueued > 0) {
1215-
made_progress = true;
1216-
}
1217-
}
1218-
12191206
// Phase 1: Check running cores for completion, process and move to idle
12201207
int32_t completed_this_turn = async_completed_this_turn;
12211208
bool fatal_error = false;
@@ -1457,7 +1444,6 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa
14571444
DEV_ALWAYS("PTO2 stall: no progress for %d iterations, completed=%d total=%d (last progress at %d)",
14581445
idle_iterations, c, task_count, last_progress_count);
14591446
async_wait_list.dump(thread_idx, STALL_DUMP_WAIT_MAX);
1460-
rt->scheduler.notification_wait_list.dump(thread_idx, STALL_DUMP_WAIT_MAX);
14611447
// Scan all task slots to find truly stuck tasks using scheduler state
14621448
PTO2SchedulerState* sched = &rt->scheduler;
14631449
PTO2SharedMemoryHeader* sm_header_diag = static_cast<PTO2SharedMemoryHeader*>(sm_base);

0 commit comments

Comments
 (0)