Skip to content

Commit 54c8d44

Browse files
author
echo_stone
committed
Enhance async_notify_demo with NotifyWait kernel and orchestration updates
- Updated the async_notify_demo to include a new NotifyWait kernel that registers a notification counter condition for inter-rank synchronization.
- Modified the consumer kernel to depend on the completion of NotifyWait, ensuring it only executes when the notification counter is satisfied.
- Enhanced orchestration logic to incorporate the NotifyWait phase, allowing for more robust task dependency management.
- Refactored kernel argument layouts to accommodate the new dependency token from NotifyWait.
- Improved runtime handling by removing legacy notification wait mechanisms, streamlining the completion process.
1 parent f6bb0b2 commit 54c8d44

16 files changed

Lines changed: 208 additions & 227 deletions

File tree

examples/a2a3/tensormap_and_ringbuffer/async_notify_demo/kernels/aiv/kernel_consumer.cpp

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@
33
*
44
* Implements: result[i] = src[i] + notify_counter[0]
55
*
6-
* This kernel is launch-gated: the scheduler only promotes it to READY after
7-
* both its fanin (producer complete) AND local notification counter >= 1.
6+
* Depends on NotifyWait completing (via dummy tensor), guaranteeing
7+
* the local notification counter >= 1 before this kernel runs.
88
*
99
* Kernel args layout (packed by scheduler):
10-
* args[0] = &Tensor(src) — input tensor struct pointer (producer's output)
11-
* args[1] = &Tensor(result) — output tensor struct pointer
12-
* args[2] = notify_counter_addr — local notify counter (window memory)
10+
* args[0] = &Tensor(dummy_notify) — input (dependency token from NotifyWait)
11+
* args[1] = &Tensor(src) — input tensor struct pointer (producer's output)
12+
* args[2] = &Tensor(result) — output tensor struct pointer
13+
* args[3] = notify_counter_addr — local notify counter (window memory)
1314
*/
1415

1516
#include <cstdint>
@@ -28,9 +29,10 @@ using namespace pto;
2829
#endif
2930

3031
extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t* args) {
31-
__gm__ Tensor* src_tensor = reinterpret_cast<__gm__ Tensor*>(args[0]);
32-
__gm__ Tensor* result_tensor = reinterpret_cast<__gm__ Tensor*>(args[1]);
33-
__gm__ int32_t* notify_counter = reinterpret_cast<__gm__ int32_t*>(args[2]);
32+
// args[0] = dummy_notify tensor (dependency token, unused)
33+
__gm__ Tensor* src_tensor = reinterpret_cast<__gm__ Tensor*>(args[1]);
34+
__gm__ Tensor* result_tensor = reinterpret_cast<__gm__ Tensor*>(args[2]);
35+
__gm__ int32_t* notify_counter = reinterpret_cast<__gm__ int32_t*>(args[3]);
3436

3537
__gm__ float* src =
3638
reinterpret_cast<__gm__ float*>(src_tensor->buffer.addr) + src_tensor->start_offset;
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/**
2+
* NotifyWait Kernel — register notification counter as CQ condition (func_id=2)
3+
*
4+
* Trivial deferred-completion kernel: registers a COUNTER wait condition
5+
* for the notification counter, then returns immediately. The scheduler
6+
* polls the counter via the CQ mechanism and completes this task once
7+
* *notify_counter >= expected_value.
8+
*
9+
* Kernel args layout:
10+
* args[0] = &Tensor(dummy_notify) — output (dependency token for downstream)
11+
* args[1] = notify_counter_addr — scalar (GM int32* to poll)
12+
* args[2] = expected_value — scalar (threshold)
13+
* args[3] = cq_addr — scalar (auto-appended by deferred submit)
14+
*/
15+
16+
#include <cstdint>
17+
18+
#ifndef __gm__
19+
#define __gm__
20+
#endif
21+
22+
#ifndef __aicore__
23+
#define __aicore__ [aicore]
24+
#endif
25+
26+
#include <pto/pto-inst.hpp>
27+
#include "tensor.h"
28+
#include "pto_cq_kernel_api.h"
29+
30+
extern "C" __aicore__ __attribute__((always_inline))
31+
void kernel_entry(__gm__ int64_t* args) {
32+
uint64_t notify_counter_addr = static_cast<uint64_t>(args[1]);
33+
uint32_t expected_value = static_cast<uint32_t>(args[2]);
34+
uint64_t cq_addr = static_cast<uint64_t>(args[3]);
35+
36+
volatile __gm__ PTO2CompletionQueue* cq = pto2_cq_get(cq_addr);
37+
pto2_cq_reset(cq);
38+
pto2_save_expected_completion(PTO2_ENGINE_SDMA, cq,
39+
notify_counter_addr, expected_value);
40+
// Flush CQ writes from AICore data cache to GM so the AICPU scheduler
41+
// can read them. pto2_cq_flush's #if-defined guards don't fire because
42+
// the constants are C++ enums, not macros — call intrinsics directly.
43+
dcci((__gm__ int32_t*)cq, cache_line_t::ENTIRE_DATA_CACHE, dcci_dst_t::CACHELINE_OUT);
44+
dsb(DSB_DDR);
45+
pipe_barrier(PIPE_ALL);
46+
}

examples/a2a3/tensormap_and_ringbuffer/async_notify_demo/kernels/kernel_config.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
Async Notify Demo - Kernel and Orchestration Configuration
33
44
Two hardware cards use TNOTIFY(AtomicAdd) for inter-rank notification.
5-
The consumer is launch-gated on the local notification counter >= 1.
5+
The consumer depends on a deferred NotifyWait task that polls the
6+
local notification counter >= 1 via the CQ mechanism.
67
"""
78

89
import os
@@ -22,6 +23,7 @@
2223
KERNELS = [
2324
{"func_id": 0, "source": str(_KERNELS_ROOT / "aiv" / "kernel_producer_notify.cpp"), "core_type": "aiv"},
2425
{"func_id": 1, "source": str(_KERNELS_ROOT / "aiv" / "kernel_consumer.cpp"), "core_type": "aiv"},
26+
{"func_id": 2, "source": str(_KERNELS_ROOT / "aiv" / "kernel_notify_wait.cpp"), "core_type": "aiv"},
2527
]
2628

2729
RUNTIME_CONFIG = {

examples/a2a3/tensormap_and_ringbuffer/async_notify_demo/kernels/orchestration/async_notify_orchestration.cpp

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,14 @@
22
* Async Notify Demo - Device-side orchestration
33
*
44
* Two-card hardware mode:
5-
* t0 (producer): out = in * 2, then TNOTIFY(AtomicAdd) the peer's window
6-
* counter. Completes normally (no deferred completion).
7-
* t1 (consumer, launch-gated): result = out + notify_counter.
8-
* Gated by local notification counter >= 1.
9-
* The scheduler only promotes this task to READY after both
10-
* its fanin is satisfied AND the local counter reaches 1.
5+
* t0 (producer, func_id=0): out = in * 2, then TNOTIFY(AtomicAdd) the
6+
* peer's window counter. Completes normally (RTC).
7+
* t1 (notify_wait, func_id=2, deferred): registers notification counter
8+
* condition (counter >= 1) via CQ, returns immediately.
9+
* Produces dummy_notify tensor for dependency chain.
10+
* t2 (consumer, func_id=1): result = out + notify_counter.
11+
* Depends on both producer (via ext_out) and notify_wait
12+
* (via dummy_notify), ensuring counter >= 1 before reading.
1113
*
1214
* The notify counter is pre-zeroed by the distributed runner input loader.
1315
*/
@@ -50,6 +52,12 @@ void aicpu_orchestration_entry(uint64_t* args, int arg_count,
5052
Tensor ext_out = make_tensor_external(out_ptr, shapes, 1, DataType::FLOAT32);
5153
Tensor ext_result = make_tensor_external(result_ptr, shapes, 1, DataType::FLOAT32);
5254

55+
uint64_t cq_notify = pto2_rt_alloc_cq();
56+
if (cq_notify == 0) {
57+
LOG_ERROR("async_notify_demo: rank %d failed CQ alloc", my_rank);
58+
return;
59+
}
60+
5361
// Producer: normal run-to-completion task (sends TNOTIFY to peer)
5462
PTOParam params_producer;
5563
params_producer.add_input(ext_in);
@@ -58,18 +66,27 @@ void aicpu_orchestration_entry(uint64_t* args, int arg_count,
5866
params_producer.add_scalar((uint64_t)(uintptr_t)comm_ctx);
5967
pto2_rt_submit_aiv_task(0, params_producer);
6068

61-
// Consumer: launch-gated by local notification counter.
62-
// After fanin (producer complete) is satisfied, the scheduler still holds
63-
// this task in PTO2NotificationWaitList until *notify_counter >= 1.
69+
// NotifyWait: deferred task that waits for notification counter >= 1.
70+
// Produces dummy_notify so the consumer can depend on it via TensorMap.
71+
uint32_t dummy_shape[1] = { 1 };
72+
Tensor dummy_notify = make_tensor(dummy_shape, 1, DataType::INT32);
73+
74+
PTOParam params_wait;
75+
params_wait.add_output(dummy_notify);
76+
params_wait.add_scalar((uint64_t)(uintptr_t)notify_counter_ptr);
77+
params_wait.add_scalar((uint64_t)1);
78+
pto2_rt_submit_aiv_task_deferred(2, params_wait, cq_notify);
79+
80+
// Consumer: depends on producer (via ext_out) and notify_wait (via dummy_notify).
81+
// Guaranteed notify_counter >= 1 when this task runs.
6482
PTOParam params_consumer;
83+
params_consumer.add_input(dummy_notify);
6584
params_consumer.add_input(ext_out);
6685
params_consumer.add_output(ext_result);
6786
params_consumer.add_scalar((uint64_t)(uintptr_t)notify_counter_ptr);
68-
pto2_rt_expect_notification_counter(params_consumer,
69-
(uint64_t)(uintptr_t)notify_counter_ptr, 1);
7087
pto2_rt_submit_aiv_task(1, params_consumer);
7188

72-
LOG_INFO("async_notify_demo: rank %d producer=normal, consumer gated on counter=0x%lx",
89+
LOG_INFO("async_notify_demo: rank %d producer=RTC, notify_wait=deferred(counter=0x%lx), consumer=RTC",
7390
my_rank, (uint64_t)(uintptr_t)notify_counter_ptr);
7491
}
7592

examples/a2a3/tensormap_and_ringbuffer/moe_dispatch/kernels/aiv/kernel_moe_recv_assemble.cpp

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
/**
2-
* MOE RecvAssemble Kernel — cumsum + assemble expandX (func_id=3)
2+
* MOE RecvAssemble Kernel — cumsum + assemble expandX (func_id=2)
33
*
4-
* Launch-gated on notification counter >= NUM_RANKS-1 (7 peers).
4+
* Depends on NotifyWait completing (via dummy tensor input),
5+
* guaranteeing notify_counter >= NUM_RANKS-1 (7 peers done).
56
*
67
* Reads local_counts + per-source-rank recv_counts, computes cumulative
78
* sums for assembly offsets, copies token data from shmem_data slots
@@ -13,12 +14,13 @@
1314
* = recv_counts[src_rank * COUNT_PAD + expert_offset] otherwise
1415
*
1516
* Kernel args layout:
16-
* args[0] = &Tensor(local_counts) — input [COUNT_PAD] int32
17-
* args[1] = &Tensor(expand_x) — output [EXPAND_X_ROWS * HIDDEN_DIM] float
18-
* args[2] = &Tensor(expert_token_nums) — output [EXPERTS_PER_RANK] int32
19-
* args[3] = shmem_data_addr — scalar (GM float* base)
20-
* args[4] = recv_counts_addr — scalar (GM int32*, [NUM_RANKS * COUNT_PAD])
21-
* args[5] = CommDeviceContext* — scalar
17+
* args[0] = &Tensor(dummy_notify) — input (dependency token from NotifyWait)
18+
* args[1] = &Tensor(local_counts) — input [COUNT_PAD] int32
19+
* args[2] = &Tensor(expand_x) — output [EXPAND_X_ROWS * HIDDEN_DIM] float
20+
* args[3] = &Tensor(expert_token_nums) — output [EXPERTS_PER_RANK] int32
21+
* args[4] = shmem_data_addr — scalar (GM float* base)
22+
* args[5] = recv_counts_addr — scalar (GM int32*, [NUM_RANKS * COUNT_PAD])
23+
* args[6] = CommDeviceContext* — scalar
2224
*/
2325

2426
#include <cstdint>
@@ -43,16 +45,17 @@ static constexpr int COUNT_PAD = 32;
4345

4446
extern "C" __aicore__ __attribute__((always_inline))
4547
void kernel_entry(__gm__ int64_t* args) {
46-
__gm__ Tensor* local_cnt_t = reinterpret_cast<__gm__ Tensor*>(args[0]);
47-
__gm__ Tensor* expand_x_t = reinterpret_cast<__gm__ Tensor*>(args[1]);
48-
__gm__ Tensor* etn_t = reinterpret_cast<__gm__ Tensor*>(args[2]);
48+
// args[0] = dummy_notify tensor (dependency token, unused)
49+
__gm__ Tensor* local_cnt_t = reinterpret_cast<__gm__ Tensor*>(args[1]);
50+
__gm__ Tensor* expand_x_t = reinterpret_cast<__gm__ Tensor*>(args[2]);
51+
__gm__ Tensor* etn_t = reinterpret_cast<__gm__ Tensor*>(args[3]);
4952

5053
__gm__ float* shmem_data =
51-
reinterpret_cast<__gm__ float*>(static_cast<uintptr_t>(args[3]));
54+
reinterpret_cast<__gm__ float*>(static_cast<uintptr_t>(args[4]));
5255
__gm__ int32_t* recv_counts =
53-
reinterpret_cast<__gm__ int32_t*>(static_cast<uintptr_t>(args[4]));
56+
reinterpret_cast<__gm__ int32_t*>(static_cast<uintptr_t>(args[5]));
5457
__gm__ CommDeviceContext* comm_ctx =
55-
reinterpret_cast<__gm__ CommDeviceContext*>(static_cast<uintptr_t>(args[5]));
58+
reinterpret_cast<__gm__ CommDeviceContext*>(static_cast<uintptr_t>(args[6]));
5659

5760
__gm__ int32_t* local_counts =
5861
reinterpret_cast<__gm__ int32_t*>(local_cnt_t->buffer.addr) + local_cnt_t->start_offset;
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
/**
2+
* NotifyWait Kernel — register notification counter as CQ condition (func_id=3)
3+
*
4+
* Trivial deferred-completion kernel: registers a COUNTER wait condition
5+
* for the notification counter, then returns immediately. The scheduler
6+
* polls the counter via the CQ mechanism and completes this task once
7+
* *notify_counter >= expected_value.
8+
*
9+
* Kernel args layout:
10+
* args[0] = &Tensor(dummy_notify) — output (dependency token for downstream)
11+
* args[1] = notify_counter_addr — scalar (GM int32* to poll)
12+
* args[2] = expected_value — scalar (threshold)
13+
* args[3] = cq_addr — scalar (auto-appended by deferred submit)
14+
*/
15+
16+
#include <cstdint>
17+
18+
#ifndef __gm__
19+
#define __gm__
20+
#endif
21+
22+
#ifndef __aicore__
23+
#define __aicore__ [aicore]
24+
#endif
25+
26+
#include <pto/pto-inst.hpp>
27+
#include "tensor.h"
28+
#include "pto_cq_kernel_api.h"
29+
#include "pto_notify_kernel_api.h"
30+
31+
extern "C" __aicore__ __attribute__((always_inline))
32+
void kernel_entry(__gm__ int64_t* args) {
33+
// args[0] = dummy_notify tensor (output, unused by kernel)
34+
uint64_t notify_counter_addr = static_cast<uint64_t>(args[1]);
35+
uint32_t expected_value = static_cast<uint32_t>(args[2]);
36+
uint64_t cq_addr = static_cast<uint64_t>(args[3]);
37+
38+
volatile __gm__ PTO2CompletionQueue* cq = pto2_cq_get(cq_addr);
39+
pto2_cq_reset(cq);
40+
pto2_save_expected_notification_counter(
41+
cq,
42+
reinterpret_cast<volatile __gm__ int32_t*>(static_cast<uintptr_t>(notify_counter_addr)),
43+
expected_value);
44+
dcci((__gm__ int32_t*)cq, cache_line_t::ENTIRE_DATA_CACHE, dcci_dst_t::CACHELINE_OUT);
45+
dsb(DSB_DDR);
46+
pipe_barrier(PIPE_ALL);
47+
}

examples/a2a3/tensormap_and_ringbuffer/moe_dispatch/kernels/kernel_config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
{"func_id": 0, "source": str(_KERNELS_ROOT / "aiv" / "kernel_moe_prepare.cpp"), "core_type": "aiv"},
4747
{"func_id": 1, "source": str(_KERNELS_ROOT / "aiv" / "kernel_moe_send_data.cpp"), "core_type": "aiv"},
4848
{"func_id": 2, "source": str(_KERNELS_ROOT / "aiv" / "kernel_moe_recv_assemble.cpp"), "core_type": "aiv"},
49+
{"func_id": 3, "source": str(_KERNELS_ROOT / "aiv" / "kernel_notify_wait.cpp"), "core_type": "aiv"},
4950
]
5051

5152
RUNTIME_CONFIG = {

examples/a2a3/tensormap_and_ringbuffer/moe_dispatch/kernels/orchestration/moe_dispatch_orchestration.cpp

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/**
2-
* MOE Dispatch V2 Orchestration — 8-rank, 3-phase task DAG
2+
* MOE Dispatch V2 Orchestration — 8-rank, 4-phase task DAG
33
*
44
* Task DAG per rank:
55
*
@@ -13,10 +13,16 @@
1313
* | 7 × TPUT_ASYNC counts → peer recv_counts
1414
* | 7 × TNOTIFY → peer notify_counter
1515
* |
16-
* +-- local_counts --> Phase 2: RecvAssemble (func_id=2, launch-gated)
17-
* IN: local_counts
18-
* OUT: expand_x, expert_token_nums
19-
* Reads shmem_data + recv_counts after 7 notifications
16+
* +-- local_counts --+
17+
* |
18+
* Phase 1.5: NotifyWait (func_id=3, deferred CQ)
19+
* OUT: dummy_notify (dependency token)
20+
* Waits for notify_counter >= NUM_RANKS-1 via CQ poll
21+
* |
22+
* Phase 2: RecvAssemble (func_id=2, RTC)
23+
* IN: local_counts, dummy_notify
24+
* OUT: expand_x, expert_token_nums
25+
* Reads shmem_data + recv_counts after NotifyWait completes
2026
*
2127
* args layout (from DISTRIBUTED_CONFIG):
2228
* [0] = tokens (window, float*)
@@ -97,8 +103,9 @@ void aicpu_orchestration_entry(uint64_t* args, int arg_count,
97103
Tensor ext_etn = make_tensor_external(etn_ptr, etn_shape, 1, DataType::INT32);
98104

99105
uint64_t sdma_context = pto2_rt_get_sdma_context();
100-
uint64_t cq = pto2_rt_alloc_cq();
101-
if (sdma_context == 0 || cq == 0) {
106+
uint64_t cq_send = pto2_rt_alloc_cq();
107+
uint64_t cq_notify = pto2_rt_alloc_cq();
108+
if (sdma_context == 0 || cq_send == 0 || cq_notify == 0) {
102109
LOG_ERROR("moe_dispatch_v2: rank %d failed SDMA context or CQ alloc", my_rank);
103110
return;
104111
}
@@ -123,20 +130,31 @@ void aicpu_orchestration_entry(uint64_t* args, int arg_count,
123130
params_send.add_scalar(notify_counter_addr);
124131
params_send.add_scalar((uint64_t)(uintptr_t)comm_ctx);
125132
params_send.add_scalar(sdma_context);
126-
pto2_rt_submit_aiv_task_deferred(1, params_send, cq);
133+
pto2_rt_submit_aiv_task_deferred(1, params_send, cq_send);
127134

128-
// Phase 2: RecvAssemble (launch-gated on 7 notifications)
135+
// Phase 1.5: NotifyWait — deferred task that waits for notification counter.
136+
// Produces a dummy_notify tensor so RecvAssemble can depend on it via TensorMap.
137+
uint32_t dummy_shape[1] = { 1 };
138+
Tensor dummy_notify = make_tensor(dummy_shape, 1, DataType::INT32);
139+
140+
PTOParam params_wait;
141+
params_wait.add_output(dummy_notify);
142+
params_wait.add_scalar(notify_counter_addr);
143+
params_wait.add_scalar((uint64_t)(NUM_RANKS - 1));
144+
pto2_rt_submit_aiv_task_deferred(3, params_wait, cq_notify);
145+
146+
// Phase 2: RecvAssemble (depends on NotifyWait via dummy_notify)
129147
PTOParam params_recv;
148+
params_recv.add_input(dummy_notify);
130149
params_recv.add_input(ext_local_counts);
131150
params_recv.add_output(ext_expand_x);
132151
params_recv.add_output(ext_etn);
133152
params_recv.add_scalar(shmem_data_addr);
134153
params_recv.add_scalar(recv_counts_addr);
135154
params_recv.add_scalar((uint64_t)(uintptr_t)comm_ctx);
136-
pto2_rt_expect_notification_counter(params_recv, notify_counter_addr, NUM_RANKS - 1);
137155
pto2_rt_submit_aiv_task(2, params_recv);
138156

139-
LOG_INFO("moe_dispatch_v2: rank %d submitted 3-phase DAG (8-rank, expect %d notifs)",
157+
LOG_INFO("moe_dispatch_v2: rank %d submitted 4-phase DAG (8-rank, expect %d notifs)",
140158
my_rank, NUM_RANKS - 1);
141159
}
142160

src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1116,8 +1116,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa
11161116
uint64_t _t0_phase = _t0;
11171117
#endif
11181118
int32_t task_count = 0;
1119-
if (!tracker.has_any_running_cores() && async_wait_list.count == 0
1120-
&& rt->scheduler.notification_wait_list.get_count() == 0) {
1119+
if (!tracker.has_any_running_cores() && async_wait_list.count == 0) {
11211120
bool orch_done = orchestrator_done_;
11221121
if (orch_done) {
11231122
// Check for orchestrator fatal error — exit immediately
@@ -1204,18 +1203,6 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa
12041203
}
12051204
}
12061205

1207-
// Phase 0b: Poll notification counter conditions (pre-launch gating)
1208-
// Only one thread polls at a time to avoid double-enqueue races.
1209-
if (rt->scheduler.notification_wait_list.get_count() > 0 &&
1210-
rt->scheduler.notification_wait_list.try_lock_poll()) {
1211-
int32_t enqueued = rt->scheduler.notification_wait_list.poll_and_enqueue(
1212-
&rt->scheduler, local_bufs);
1213-
rt->scheduler.notification_wait_list.unlock_poll();
1214-
if (enqueued > 0) {
1215-
made_progress = true;
1216-
}
1217-
}
1218-
12191206
// Phase 1: Check running cores for completion, process and move to idle
12201207
int32_t completed_this_turn = async_completed_this_turn;
12211208
bool fatal_error = false;
@@ -1457,7 +1444,6 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa
14571444
DEV_ALWAYS("PTO2 stall: no progress for %d iterations, completed=%d total=%d (last progress at %d)",
14581445
idle_iterations, c, task_count, last_progress_count);
14591446
async_wait_list.dump(thread_idx, STALL_DUMP_WAIT_MAX);
1460-
rt->scheduler.notification_wait_list.dump(thread_idx, STALL_DUMP_WAIT_MAX);
14611447
// Scan all task slots to find truly stuck tasks using scheduler state
14621448
PTO2SchedulerState* sched = &rt->scheduler;
14631449
PTO2SharedMemoryHeader* sm_header_diag = static_cast<PTO2SharedMemoryHeader*>(sm_base);

0 commit comments

Comments
 (0)