@@ -143,7 +143,6 @@ bool pto2_orchestrator_init(
         pto2_dep_pool_init(&orch->rings[r].dep_pool, dep_entries, dep_pool_capacity);
         orch->rings[r].dep_pool.error_code_ptr = &sm_handle->header->orch_error_code;
         orch->dep_pool_cur_entries[r] = nullptr;
-        orch->dep_pool_last_reclaimed[r] = 0;
     }
 
     // Initialize TensorMap with per-ring task window sizes
@@ -158,9 +157,6 @@ bool pto2_orchestrator_init(
         return false;
     }
     orch->tensor_map.orch = orch;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        orch->tensormap_last_cleanup[r] = 0;
-    }
 
     // Initialize scope stack: one flat buffer for task IDs + one array for begin offsets
     uint64_t max_depth = PTO2_MAX_SCOPE_DEPTH;
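
The scope stack initialized here follows a classic pattern: nested scopes share one flat task-ID buffer, and each nesting level only needs the offset where its scope began. A minimal sketch of that layout (struct and field names are illustrative, not the PTO2 definitions):

#include <cstdint>
#include <utility>

struct ScopeStackSketch {
    uint64_t task_ids[4096];  // one flat buffer shared by all nested scopes
    uint32_t begin[64];       // begin[d] = task_ids index where the depth-d scope starts
    uint32_t top = 0;         // next free slot in task_ids
    uint32_t depth = 0;       // current nesting depth

    void scope_begin() { begin[depth++] = top; }
    void push(uint64_t id) { task_ids[top++] = id; }

    // Closing a scope yields the [first, last) range of its task IDs and
    // releases the flat-buffer space in LIFO order.
    std::pair<uint32_t, uint32_t> scope_end() {
        uint32_t first = begin[--depth];
        uint32_t last = top;
        top = first;
        return {first, last};
    }
};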
@@ -203,40 +199,18 @@ void pto2_orchestrator_set_scheduler(PTO2OrchestratorState* orch, PTO2SchedulerS
 }
 
 
-// =============================================================================
-// Dep Pool Reclamation
-// =============================================================================
-
-/**
- * Reclaim dead dep pool entries for a specific ring based on scheduler's last_task_alive.
- * Safe to call multiple times — only advances tail forward.
- */
-static void pto2_dep_pool_reclaim(PTO2OrchestratorState* orch, int32_t ring_id) {
-    int32_t last_alive =
-        orch->sm_handle->header->rings[ring_id].fc.last_task_alive.load(std::memory_order_acquire);
-    if (last_alive > orch->dep_pool_last_reclaimed[ring_id] && last_alive > 0) {
-        int32_t newest_consumed = last_alive - 1;
-        int32_t slot_rc = orch->rings[ring_id].task_ring.get_task_slot(newest_consumed);
-        int32_t mark = orch->sm_handle->task_payloads[ring_id][slot_rc].dep_pool_mark;
-        if (mark > 0) {
-            orch->rings[ring_id].dep_pool.advance_tail(mark);
-        }
-        orch->dep_pool_last_reclaimed[ring_id] = last_alive;
-    }
-}
-
 /**
  * Ensure dep pool for a specific ring has at least `needed` entries available.
  * Spin-waits for reclamation if under pressure. Detects deadlock if no progress.
  */
-static void pto2_dep_pool_ensure_space(PTO2OrchestratorState* orch, int32_t ring_id, int32_t needed) {
+static void pto2_dep_pool_ensure_space(PTO2OrchestratorState* orch, uint8_t ring_id, int32_t needed) {
     if (pto2_dep_pool_available(&orch->rings[ring_id].dep_pool) >= needed) return;
 
     int spin_count = 0;
     int32_t prev_last_alive =
         orch->sm_handle->header->rings[ring_id].fc.last_task_alive.load(std::memory_order_acquire);
     while (pto2_dep_pool_available(&orch->rings[ring_id].dep_pool) < needed) {
-        pto2_dep_pool_reclaim(orch, ring_id);
+        orch->rings[ring_id].dep_pool.reclaim(orch->scheduler, ring_id, prev_last_alive);
         if (pto2_dep_pool_available(&orch->rings[ring_id].dep_pool) >= needed) return;
 
         spin_count++;
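
For context on what the new dep_pool.reclaim(sched, ring_id, last_alive) call replaces: the deleted helper advanced the pool tail to the watermark recorded by the newest consumed task. A minimal sketch of that watermark scheme, assuming each task records the pool's top at submit time and that the tail may only move forward (names are illustrative):

#include <cstdint>

struct DepPoolReclaimSketch {
    int32_t tail = 0;            // oldest live entry index
    int32_t top = 0;             // next free entry index
    int32_t last_reclaimed = 0;  // newest last_task_alive already processed

    // mark_of(task_id) returns the watermark (pool top) recorded when that
    // task was submitted; entries below it are dead once the task retires.
    template <typename MarkFn>
    void reclaim(int32_t last_task_alive, MarkFn mark_of) {
        if (last_task_alive <= 0 || last_task_alive <= last_reclaimed) return;
        int32_t newest_consumed = last_task_alive - 1;
        int32_t mark = mark_of(newest_consumed);
        if (mark > tail) tail = mark;  // tail only advances, so repeated calls are safe
        last_reclaimed = last_task_alive;
    }
};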
@@ -334,7 +308,11 @@ void pto2_scope_end(PTO2OrchestratorState* orch) {
 void pto2_submit_mixed_task(
     PTO2OrchestratorState* orch, const MixedKernels& mixed_kernels, const PTOParam& params) {
     // Fast path after fatal error — all subsequent submits are no-ops
-    if (orch->fatal) { return; }
+    if (orch->fatal) {
+        return;
+    }
+
+    PTO2SchedulerState* sched = orch->scheduler;
 
     // Validate PTOParam construction (errors recorded by add_input/add_output/etc.)
     if (params.has_error) {
@@ -370,14 +348,20 @@ void pto2_submit_mixed_task(
     }
 
     // === STEP 0: Sync TensorMap validity and optional cleanup ===
-    orch->tensor_map.sync_tensormap();
 
     // Determine which ring this task belongs to
-    int32_t ring_id = orch->current_ring_id();
+    uint8_t ring_id = orch->current_ring_id();
     auto& task_ring = orch->rings[ring_id].task_ring;
 
-    // Reclaim dead dep pool entries based on scheduler's last_task_alive
-    pto2_dep_pool_reclaim(orch, ring_id);
+    // Read current last_task_alive from shared memory for this ring
+    int32_t sm_last_task_alive =
+        orch->sm_handle->header->rings[ring_id].fc.last_task_alive.load(std::memory_order_acquire);
+
+    orch->tensor_map.sync_tensormap(ring_id, sm_last_task_alive);
+
+    if (sched) {
+        orch->rings[ring_id].dep_pool.reclaim(sched, ring_id, sm_last_task_alive);
+    }
 
     CYCLE_COUNT_LAP_RECORD(g_orch_sync_cycle, AicpuPhaseId::ORCH_SYNC, -1);
 
@@ -427,8 +411,7 @@ void pto2_submit_mixed_task(
     int32_t local_id = task_ring.pto2_task_ring_alloc();
     if (local_id < 0) { orch->fatal = true; return; }
     int32_t slot = task_ring.get_task_slot(local_id);
-    PTO2TaskId mixed_task_id =
-        pto2_make_task_id(static_cast<uint8_t>(ring_id), static_cast<uint32_t>(local_id));
+    PTO2TaskId mixed_task_id = pto2_make_task_id(ring_id, static_cast<uint32_t>(local_id));
 
     PTO2TaskDescriptor& task = task_ring.get_task_by_slot(slot);
     PTO2TaskPayload* payload = &orch->sm_handle->task_payloads[ring_id][slot];
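
pto2_make_task_id can now take ring_id directly because it is a uint8_t end to end. The bit layout below is an assumption for illustration only (the real PTO2TaskId packing is not shown in this diff); the point is that putting the ring in the high bits keeps (ring, local) pairs collision-free:

#include <cstdint>

// Hypothetical layout: bits [39:32] hold the ring id, bits [31:0] the
// ring-local monotonically increasing id.
static inline uint64_t make_task_id_sketch(uint8_t ring_id, uint32_t local_id) {
    return (static_cast<uint64_t>(ring_id) << 32) | local_id;
}
static inline uint8_t ring_of_sketch(uint64_t id) { return static_cast<uint8_t>(id >> 32); }
static inline uint32_t local_of_sketch(uint64_t id) { return static_cast<uint32_t>(id); }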
@@ -444,21 +427,11 @@ void pto2_submit_mixed_task(
     for (int32_t j = 0; j < params.scalar_count; j += 8) {
         __builtin_prefetch(&payload->scalars[j], 1, 3);
     }
-    // Metadata area: tensor_count, scalar_count, fanin_slot_states[] — all in first 3 CLs
     __builtin_prefetch(payload, 1, 3);
     __builtin_prefetch(reinterpret_cast<char*>(payload) + 64, 1, 3);
     __builtin_prefetch(reinterpret_cast<char*>(payload) + 128, 1, 3);
 
-    // Initialize mixed-task descriptor
-    task.mixed_task_id = mixed_task_id;
-    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIC)] = normalized.aic_kernel_id;
-    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV0)] = normalized.aiv0_kernel_id;
-    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV1)] = normalized.aiv1_kernel_id;
-    task.packed_buffer_base = NULL;
-    task.packed_buffer_end = NULL;
-
     // Initialize slot state (scheduler-private)
-    PTO2SchedulerState* sched = orch->scheduler;
     if (sched) {
         auto& rs = sched->ring_sched_states[ring_id];
         PTO2TaskSlotState& slot_state = rs.get_slot_state_by_slot(slot);
@@ -473,7 +446,7 @@ void pto2_submit_mixed_task(
         slot_state.task = &task;
         slot_state.active_mask = active_mask;
         slot_state.subtask_done_mask.store(0, std::memory_order_relaxed);
-        slot_state.ring_id = static_cast<uint8_t>(ring_id);
+        slot_state.ring_id = ring_id;
         scope_tasks_push(orch, &slot_state);
     } else {
         scope_tasks_push(orch, nullptr);
@@ -496,10 +469,12 @@ void pto2_submit_mixed_task(
         }
     }
 
+    void* local_packed_base = nullptr;
+    void* local_packed_end = nullptr;
     if (total_output_size > 0) {
-        task.packed_buffer_base = orch->pto2_alloc_packed_buffer(total_output_size);
-        if (!task.packed_buffer_base) { orch->fatal = true; return; }
-        task.packed_buffer_end = (char*)task.packed_buffer_base + total_output_size;
+        local_packed_base = orch->pto2_alloc_packed_buffer(total_output_size);
+        if (!local_packed_base) { orch->fatal = true; return; }
+        local_packed_end = (char*)local_packed_base + total_output_size;
     }
     CYCLE_COUNT_LAP_RECORD(g_orch_heap_cycle, AicpuPhaseId::ORCH_HEAP, local_id);
 #if PTO2_ORCH_PROFILING
@@ -559,7 +534,7 @@ void pto2_submit_mixed_task(
         case PTOParamType::OUTPUT: {
             Tensor& tensor = *params.tensors[i];
             if (tensor.buffer.addr == 0) {
-                uint64_t alloc_addr = reinterpret_cast<uint64_t>((char*)task.packed_buffer_base + offset);
+                uint64_t alloc_addr = reinterpret_cast<uint64_t>((char*)local_packed_base + offset);
                 tensor.buffer.addr = alloc_addr;
                 offset += PTO2_ALIGN_UP(tensor.buffer.size, PTO2_PACKED_OUTPUT_ALIGN);
             }
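
Each OUTPUT tensor without a preallocated buffer lands at the current offset inside the single packed allocation, and the offset then advances by the aligned size. A standalone sketch of the same arithmetic; the 512-byte value is a stand-in, since the real PTO2_PACKED_OUTPUT_ALIGN is not shown in this diff:

#include <cstdint>

constexpr uint64_t kPackAlign = 512;  // stand-in for PTO2_PACKED_OUTPUT_ALIGN

// Round n up to the next multiple of a (a must be a power of two).
constexpr uint64_t align_up(uint64_t n, uint64_t a) { return (n + a - 1) & ~(a - 1); }

// For output sizes {100, 600, 4096}: the sizing pass reserves
// align_up(100) + align_up(600) + align_up(4096) = 512 + 1024 + 4096 = 5632 bytes,
// and the placement pass hands out sub-buffer offsets 0, 512, and 1536.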
@@ -582,6 +557,16 @@ void pto2_submit_mixed_task(
 
     CYCLE_COUNT_LAP_RECORD(g_orch_insert_cycle, AicpuPhaseId::ORCH_INSERT, local_id);
 
+    // === Batch-write task descriptor to GM (single cache line burst) ===
+    // Deferred from the allocation phase to avoid scattered GM writes that get
+    // evicted by TensorMap lookup/insert cache pressure.
+    __builtin_prefetch(&task, 1, 1);
+    task.mixed_task_id = mixed_task_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIC)] = normalized.aic_kernel_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV0)] = normalized.aiv0_kernel_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV1)] = normalized.aiv1_kernel_id;
+    task.packed_buffer_base = local_packed_base;
+    task.packed_buffer_end = local_packed_end;
 
     // Prefetch producer slot_states and cur_slot_state (written at init but likely
     // evicted by lookup/insert/heap). param_copy below provides hide time.
@@ -657,6 +642,8 @@ void pto2_submit_mixed_task(
         PTO2ResourceShape shape = pto2_active_mask_to_shape(active_mask);
         sched->ready_queues[static_cast<int32_t>(shape)].push(&cur_slot_state);
     }
+    // Record dep pool watermark in local slot state (used by tail reclamation)
+    cur_slot_state.dep_pool_mark = orch->rings[ring_id].dep_pool.top;
 #if PTO2_ORCH_PROFILING
     // Per producer: fetch_add(fanout_count) + load(task_state) + store(unlock) = 3 atomics
     // Lock atomics (loads + CAS) are counted inside pto2_fanout_lock
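
With dep_pool_mark now held in scheduler-local slot state, the only cross-agent handshake left on this path is last_task_alive, loaded with memory_order_acquire above. That acquire presumably pairs with a release store on the consumer side (the store site is not part of this diff); a minimal sketch of the pairing and why it makes reclamation safe:

#include <atomic>
#include <cstdint>

std::atomic<int32_t> last_task_alive{0};

// Consumer side (sketch): finish task t, then publish progress. The release
// store makes all prior writes visible to anyone who acquire-loads the value.
void consumer_retire_sketch(int32_t t) {
    // ... mark task t's entries dead ...
    last_task_alive.store(t + 1, std::memory_order_release);
}

// Orchestrator side (sketch): the acquire load pairs with the release store,
// so reclamation only sees watermarks of tasks the consumer truly finished.
int32_t read_progress_sketch() {
    return last_task_alive.load(std::memory_order_acquire);
}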
@@ -667,9 +654,6 @@ void pto2_submit_mixed_task(
 #endif
     }
 
-    // Record dep pool watermark for this task (used by tail reclamation)
-    payload->dep_pool_mark = orch->rings[ring_id].dep_pool.top;
-
     CYCLE_COUNT_LAP_RECORD(g_orch_fanin_cycle, AicpuPhaseId::ORCH_FANIN, local_id);
 
 #if PTO2_PROFILING