Skip to content

Commit 8200078

Browse files
Author: echo_stone (committed)
Fix(pr): address review comments for #334
- Remove preemptive flush_deferred_releases guard and unused lambda from executor loop; rely on existing inline flush-on-full and idle-batch-flush paths (reviewer: poursoul)
- Clarify cache_invalidate_range comment: all current counter writers (SDMA flags, TNOTIFY RDMA atomics) bypass AICPU cache, so invalidation is always required (reviewer: uv-xiao)
- Add pto2_rt_submit_notification_wait_task() helper API to pto_orchestration_api.h, reducing NotifyWait boilerplate in orchestration code (reviewer: uv-xiao)
- Simplify async_notify_demo and moe_dispatch orchestration to use the new helper API
- Remove unused PTO2LocalReadyBuffer forward declaration (reviewer: uv-xiao)

Made-with: Cursor
1 parent 5f4e78b commit 8200078

6 files changed

Lines changed: 50 additions & 57 deletions

File tree

examples/a2a3/tensormap_and_ringbuffer/async_notify_demo/kernels/orchestration/async_notify_orchestration.cpp

Lines changed: 5 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -52,12 +52,6 @@ void aicpu_orchestration_entry(uint64_t* args, int arg_count,
5252
Tensor ext_out = make_tensor_external(out_ptr, shapes, 1, DataType::FLOAT32);
5353
Tensor ext_result = make_tensor_external(result_ptr, shapes, 1, DataType::FLOAT32);
5454

55-
uint64_t cq_notify = pto2_rt_alloc_cq();
56-
if (cq_notify == 0) {
57-
LOG_ERROR("async_notify_demo: rank %d failed CQ alloc", my_rank);
58-
return;
59-
}
60-
6155
// Producer: normal run-to-completion task (sends TNOTIFY to peer)
6256
PTOParam params_producer;
6357
params_producer.add_input(ext_in);
@@ -67,20 +61,13 @@ void aicpu_orchestration_entry(uint64_t* args, int arg_count,
6761
pto2_rt_submit_aiv_task(0, params_producer);
6862

6963
// NotifyWait: deferred task that waits for notification counter >= 1.
70-
// Produces dummy_notify so the consumer can depend on it via TensorMap.
71-
uint32_t dummy_shape[1] = { 1 };
72-
Tensor dummy_notify = make_tensor(dummy_shape, 1, DataType::INT32);
73-
74-
PTOParam params_wait;
75-
params_wait.add_output(dummy_notify);
76-
params_wait.add_scalar((uint64_t)(uintptr_t)notify_counter_ptr);
77-
params_wait.add_scalar((uint64_t)1);
78-
pto2_rt_submit_aiv_task_deferred(2, params_wait, cq_notify);
64+
// Returns a dependency token tensor for downstream tasks.
65+
Tensor notify_token = pto2_rt_submit_notification_wait_task(
66+
2, (uint64_t)(uintptr_t)notify_counter_ptr, 1);
7967

80-
// Consumer: depends on producer (via ext_out) and notify_wait (via dummy_notify).
81-
// Guaranteed notify_counter >= 1 when this task runs.
68+
// Consumer: depends on producer (via ext_out) and notify_wait (via token).
8269
PTOParam params_consumer;
83-
params_consumer.add_input(dummy_notify);
70+
params_consumer.add_input(notify_token);
8471
params_consumer.add_input(ext_out);
8572
params_consumer.add_output(ext_result);
8673
params_consumer.add_scalar((uint64_t)(uintptr_t)notify_counter_ptr);

examples/a2a3/tensormap_and_ringbuffer/moe_dispatch/kernels/orchestration/moe_dispatch_orchestration.cpp

Lines changed: 7 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -104,8 +104,7 @@ void aicpu_orchestration_entry(uint64_t* args, int arg_count,
104104

105105
uint64_t sdma_context = pto2_rt_get_sdma_context();
106106
uint64_t cq_send = pto2_rt_alloc_cq();
107-
uint64_t cq_notify = pto2_rt_alloc_cq();
108-
if (sdma_context == 0 || cq_send == 0 || cq_notify == 0) {
107+
if (sdma_context == 0 || cq_send == 0) {
109108
LOG_ERROR("moe_dispatch_v2: rank %d failed SDMA context or CQ alloc", my_rank);
110109
return;
111110
}
@@ -132,20 +131,14 @@ void aicpu_orchestration_entry(uint64_t* args, int arg_count,
132131
params_send.add_scalar(sdma_context);
133132
pto2_rt_submit_aiv_task_deferred(1, params_send, cq_send);
134133

135-
// Phase 1.5: NotifyWait — deferred task that waits for notification counter.
136-
// Produces a dummy_notify tensor so RecvAssemble can depend on it via TensorMap.
137-
uint32_t dummy_shape[1] = { 1 };
138-
Tensor dummy_notify = make_tensor(dummy_shape, 1, DataType::INT32);
134+
// Phase 1.5: NotifyWait — deferred wait for notification counter >= NUM_RANKS-1.
135+
// Returns a dependency token for RecvAssemble via TensorMap.
136+
Tensor notify_token = pto2_rt_submit_notification_wait_task(
137+
3, notify_counter_addr, NUM_RANKS - 1);
139138

140-
PTOParam params_wait;
141-
params_wait.add_output(dummy_notify);
142-
params_wait.add_scalar(notify_counter_addr);
143-
params_wait.add_scalar((uint64_t)(NUM_RANKS - 1));
144-
pto2_rt_submit_aiv_task_deferred(3, params_wait, cq_notify);
145-
146-
// Phase 2: RecvAssemble (depends on NotifyWait via dummy_notify)
139+
// Phase 2: RecvAssemble (depends on NotifyWait via notify_token)
147140
PTOParam params_recv;
148-
params_recv.add_input(dummy_notify);
141+
params_recv.add_input(notify_token);
149142
params_recv.add_input(ext_local_counts);
150143
params_recv.add_output(ext_expand_x);
151144
params_recv.add_output(ext_etn);

src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp

Lines changed: 0 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -1083,25 +1083,6 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa
10831083

10841084
PTO2AsyncWaitList async_wait_list;
10851085

1086-
auto flush_deferred_releases = [&]() {
1087-
while (deferred_release_count > 0) {
1088-
#if PTO2_SCHED_PROFILING
1089-
int32_t fe = rt->scheduler.on_task_release(
1090-
*deferred_release_slot_states[--deferred_release_count], thread_idx);
1091-
#else
1092-
int32_t fe = rt->scheduler.on_task_release(
1093-
*deferred_release_slot_states[--deferred_release_count]);
1094-
#endif
1095-
(void)fe;
1096-
#if PTO2_SCHED_PROFILING
1097-
fanin_edges_total += fe;
1098-
if (fe > fanin_max_degree) {
1099-
fanin_max_degree = fe;
1100-
}
1101-
#endif
1102-
}
1103-
};
1104-
11051086
bool cores_released = false;
11061087

11071088
#if PTO2_PROFILING
@@ -1172,9 +1153,6 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa
11721153

11731154
// Phase 0: Poll async completion conditions (deferred-completion tasks)
11741155
int32_t async_completed_this_turn = 0;
1175-
if (deferred_release_count > MAX_DEFERRED_RELEASES - PTO2_MAX_ASYNC_WAITS) {
1176-
flush_deferred_releases();
1177-
}
11781156
if (async_wait_list.count > 0) {
11791157
PTO2AsyncPollResult poll_result = async_wait_list.poll_and_complete<false>(
11801158
&rt->scheduler, local_bufs,

src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h

Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -253,6 +253,40 @@ static inline void pto2_rt_submit_task_deferred(const MixedKernels& mixed_kernel
253253
rt->ops->submit_task(rt, mixed_kernels, params);
254254
}
255255

256+
/**
257+
* Submit a notification-wait deferred task and return a dependency token.
258+
*
259+
* Encapsulates the boilerplate for creating a NotifyWait task:
260+
* 1. Allocate a CQ
261+
* 2. Create a 1-element dummy output tensor (dependency token)
262+
* 3. Submit a deferred AIV task with (counter_addr, expected_value, cq_addr)
263+
*
264+
* The returned token tensor should be added as an input to any downstream
265+
* task that depends on the notification completing.
266+
*
267+
* @param kernel_id func_id of the NotifyWait kernel
268+
* @param counter_addr GM address of the notification counter (int32*)
269+
* @param expected_value threshold: task completes when *counter >= expected
270+
* @return dependency token tensor (add as input to downstream tasks)
271+
*/
272+
static inline Tensor pto2_rt_submit_notification_wait_task(
273+
int32_t kernel_id,
274+
uint64_t counter_addr,
275+
uint32_t expected_value) {
276+
uint64_t cq_addr = pto2_rt_alloc_cq();
277+
278+
uint32_t dummy_shape[1] = { 1 };
279+
Tensor token = make_tensor(dummy_shape, 1, DataType::INT32);
280+
281+
PTOParam params;
282+
params.add_output(token);
283+
params.add_scalar(counter_addr);
284+
params.add_scalar(static_cast<uint64_t>(expected_value));
285+
pto2_rt_submit_aiv_task_deferred(kernel_id, params, cq_addr);
286+
287+
return token;
288+
}
289+
256290
static inline void pto2_rt_scope_begin() {
257291
PTO2Runtime* rt = pto2_current_runtime();
258292
rt->ops->scope_begin(rt);

src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_wait.h

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -185,8 +185,10 @@ struct PTO2AsyncWaitList {
185185
for (int32_t c = 0; c < entry.condition_count; c++) {
186186
PTO2CompletionCondition& cond = entry.conditions[c];
187187
if (!cond.satisfied) {
188-
// RDMA-written counters (e.g. TNOTIFY) bypass AICPU data cache.
189-
// Invalidate before reading to see the true memory value.
188+
// All current counter writers (SDMA engine flags, TNOTIFY
189+
// RDMA atomics) bypass AICPU data cache. Invalidation is
190+
// needed so the poll reads the true GM value. For any
191+
// hypothetical CPU-written counter this is a harmless no-op.
190192
if (cond.counter_addr) {
191193
cache_invalidate_range(
192194
reinterpret_cast<const void*>(const_cast<const uint32_t*>(cond.counter_addr)),

src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,6 @@
2828
#include "common/core_type.h"
2929

3030
struct PTO2SchedulerState;
31-
struct PTO2LocalReadyBuffer;
3231

3332
#if PTO2_SCHED_PROFILING
3433
#include "aicpu/device_time.h"

0 commit comments

Comments (0)