
Commit e4348eb

Refactor: move reclamation state into owning data structures (#315)
- Move dep_pool_mark from PTO2TaskPayload (GM) to PTO2TaskSlotState (local memory) to avoid GM cache line pollution
- Move last_reclaimed into PTO2DepListPool and last_cleanup into PTO2TensorMap, eliminating parallel arrays in orchestrator state
- Consolidate the per-submit sm_last_task_alive read: a single atomic load is now shared by TensorMap sync and dep pool reclaim
- Simplify sync_tensormap to a per-ring interface, removing the multi-ring loop and the MIN_FREE_NUM pressure heuristic
- Defer task descriptor GM writes until after tensor insertion to batch cache line stores and reduce eviction pressure
- Narrow the ring_id type from int32_t to uint8_t throughout
1 parent 77a81aa commit e4348eb

7 files changed

Lines changed: 80 additions & 81 deletions
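
The core of the change is the reworked per-submit synchronization in pto2_submit_mixed_task: the ring's last_task_alive is read once with acquire semantics, and the same value feeds both the TensorMap sync and the dep pool reclaim. Condensed from the pto_orchestrator.cpp diff below (all names exactly as they appear there):

    uint8_t ring_id = orch->current_ring_id();
    int32_t sm_last_task_alive =
        orch->sm_handle->header->rings[ring_id].fc.last_task_alive.load(std::memory_order_acquire);

    orch->tensor_map.sync_tensormap(ring_id, sm_last_task_alive);  // validity refresh + interval-gated cleanup
    if (orch->scheduler) {
      orch->rings[ring_id].dep_pool.reclaim(orch->scheduler, ring_id, sm_last_task_alive);
    }

Previously each consumer issued its own atomic load (and sync_tensormap looped over every ring), so a single submit could touch the shared flow-control cache line several times.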


src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp

Lines changed: 37 additions & 53 deletions
@@ -143,7 +143,6 @@ bool pto2_orchestrator_init(
     pto2_dep_pool_init(&orch->rings[r].dep_pool, dep_entries, dep_pool_capacity);
     orch->rings[r].dep_pool.error_code_ptr = &sm_handle->header->orch_error_code;
     orch->dep_pool_cur_entries[r] = nullptr;
-    orch->dep_pool_last_reclaimed[r] = 0;
   }

   // Initialize TensorMap with per-ring task window sizes
@@ -158,9 +157,6 @@ bool pto2_orchestrator_init(
     return false;
   }
   orch->tensor_map.orch = orch;
-  for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-    orch->tensormap_last_cleanup[r] = 0;
-  }

   // Initialize scope stack: one flat buffer for task IDs + one array for begin offsets
   uint64_t max_depth = PTO2_MAX_SCOPE_DEPTH;
@@ -203,40 +199,18 @@ void pto2_orchestrator_set_scheduler(PTO2OrchestratorState* orch, PTO2SchedulerS
 }


-// =============================================================================
-// Dep Pool Reclamation
-// =============================================================================
-
-/**
- * Reclaim dead dep pool entries for a specific ring based on scheduler's last_task_alive.
- * Safe to call multiple times — only advances tail forward.
- */
-static void pto2_dep_pool_reclaim(PTO2OrchestratorState* orch, int32_t ring_id) {
-  int32_t last_alive =
-      orch->sm_handle->header->rings[ring_id].fc.last_task_alive.load(std::memory_order_acquire);
-  if (last_alive > orch->dep_pool_last_reclaimed[ring_id] && last_alive > 0) {
-    int32_t newest_consumed = last_alive - 1;
-    int32_t slot_rc = orch->rings[ring_id].task_ring.get_task_slot(newest_consumed);
-    int32_t mark = orch->sm_handle->task_payloads[ring_id][slot_rc].dep_pool_mark;
-    if (mark > 0) {
-      orch->rings[ring_id].dep_pool.advance_tail(mark);
-    }
-    orch->dep_pool_last_reclaimed[ring_id] = last_alive;
-  }
-}
-
 /**
  * Ensure dep pool for a specific ring has at least `needed` entries available.
  * Spin-waits for reclamation if under pressure. Detects deadlock if no progress.
  */
-static void pto2_dep_pool_ensure_space(PTO2OrchestratorState* orch, int32_t ring_id, int32_t needed) {
+static void pto2_dep_pool_ensure_space(PTO2OrchestratorState* orch, uint8_t ring_id, int32_t needed) {
   if (pto2_dep_pool_available(&orch->rings[ring_id].dep_pool) >= needed) return;

   int spin_count = 0;
   int32_t prev_last_alive =
       orch->sm_handle->header->rings[ring_id].fc.last_task_alive.load(std::memory_order_acquire);
   while (pto2_dep_pool_available(&orch->rings[ring_id].dep_pool) < needed) {
-    pto2_dep_pool_reclaim(orch, ring_id);
+    orch->rings[ring_id].dep_pool.reclaim(orch->scheduler, ring_id, prev_last_alive);
     if (pto2_dep_pool_available(&orch->rings[ring_id].dep_pool) >= needed) return;

     spin_count++;
@@ -334,7 +308,11 @@ void pto2_scope_end(PTO2OrchestratorState* orch) {
 void pto2_submit_mixed_task(
     PTO2OrchestratorState* orch, const MixedKernels& mixed_kernels, const PTOParam& params) {
   // Fast path after fatal error — all subsequent submits are no-ops
-  if (orch->fatal) { return; }
+  if (orch->fatal) {
+    return;
+  }
+
+  PTO2SchedulerState* sched = orch->scheduler;

   // Validate PTOParam construction (errors recorded by add_input/add_output/etc.)
   if (params.has_error) {
@@ -370,14 +348,20 @@
   }

   // === STEP 0: Sync TensorMap validity and optional cleanup ===
-  orch->tensor_map.sync_tensormap();

   // Determine which ring this task belongs to
-  int32_t ring_id = orch->current_ring_id();
+  uint8_t ring_id = orch->current_ring_id();
   auto& task_ring = orch->rings[ring_id].task_ring;

-  // Reclaim dead dep pool entries based on scheduler's last_task_alive
-  pto2_dep_pool_reclaim(orch, ring_id);
+  // Read current last_task_alive from shared memory for this ring
+  int32_t sm_last_task_alive =
+      orch->sm_handle->header->rings[ring_id].fc.last_task_alive.load(std::memory_order_acquire);
+
+  orch->tensor_map.sync_tensormap(ring_id, sm_last_task_alive);
+
+  if (sched) {
+    orch->rings[ring_id].dep_pool.reclaim(sched, ring_id, sm_last_task_alive);
+  }

   CYCLE_COUNT_LAP_RECORD(g_orch_sync_cycle, AicpuPhaseId::ORCH_SYNC, -1);

@@ -427,8 +411,7 @@
   int32_t local_id = task_ring.pto2_task_ring_alloc();
   if (local_id < 0) { orch->fatal = true; return; }
   int32_t slot = task_ring.get_task_slot(local_id);
-  PTO2TaskId mixed_task_id =
-      pto2_make_task_id(static_cast<uint8_t>(ring_id), static_cast<uint32_t>(local_id));
+  PTO2TaskId mixed_task_id = pto2_make_task_id(ring_id, static_cast<uint32_t>(local_id));

   PTO2TaskDescriptor& task = task_ring.get_task_by_slot(slot);
   PTO2TaskPayload* payload = &orch->sm_handle->task_payloads[ring_id][slot];
@@ -444,21 +427,11 @@
   for (int32_t j = 0; j < params.scalar_count; j += 8) {
     __builtin_prefetch(&payload->scalars[j], 1, 3);
   }
-  // Metadata area: tensor_count, scalar_count, fanin_slot_states[] — all in first 3 CLs
   __builtin_prefetch(payload, 1, 3);
   __builtin_prefetch(reinterpret_cast<char*>(payload) + 64, 1, 3);
   __builtin_prefetch(reinterpret_cast<char*>(payload) + 128, 1, 3);

-  // Initialize mixed-task descriptor
-  task.mixed_task_id = mixed_task_id;
-  task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIC)] = normalized.aic_kernel_id;
-  task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV0)] = normalized.aiv0_kernel_id;
-  task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV1)] = normalized.aiv1_kernel_id;
-  task.packed_buffer_base = NULL;
-  task.packed_buffer_end = NULL;
-
   // Initialize slot state (scheduler-private)
-  PTO2SchedulerState* sched = orch->scheduler;
   if (sched) {
     auto& rs = sched->ring_sched_states[ring_id];
     PTO2TaskSlotState& slot_state = rs.get_slot_state_by_slot(slot);
@@ -473,7 +446,7 @@
     slot_state.task = &task;
     slot_state.active_mask = active_mask;
     slot_state.subtask_done_mask.store(0, std::memory_order_relaxed);
-    slot_state.ring_id = static_cast<uint8_t>(ring_id);
+    slot_state.ring_id = ring_id;
     scope_tasks_push(orch, &slot_state);
   } else {
     scope_tasks_push(orch, nullptr);
@@ -496,10 +469,12 @@
     }
   }

+  void* local_packed_base = nullptr;
+  void* local_packed_end = nullptr;
   if (total_output_size > 0) {
-    task.packed_buffer_base = orch->pto2_alloc_packed_buffer(total_output_size);
-    if (!task.packed_buffer_base) { orch->fatal = true; return; }
-    task.packed_buffer_end = (char*)task.packed_buffer_base + total_output_size;
+    local_packed_base = orch->pto2_alloc_packed_buffer(total_output_size);
+    if (!local_packed_base) { orch->fatal = true; return; }
+    local_packed_end = (char*)local_packed_base + total_output_size;
   }
   CYCLE_COUNT_LAP_RECORD(g_orch_heap_cycle, AicpuPhaseId::ORCH_HEAP, local_id);
 #if PTO2_ORCH_PROFILING
@@ -559,7 +534,7 @@
       case PTOParamType::OUTPUT: {
         Tensor& tensor = *params.tensors[i];
         if (tensor.buffer.addr == 0) {
-          uint64_t alloc_addr = reinterpret_cast<uint64_t>((char*)task.packed_buffer_base + offset);
+          uint64_t alloc_addr = reinterpret_cast<uint64_t>((char*)local_packed_base + offset);
           tensor.buffer.addr = alloc_addr;
           offset += PTO2_ALIGN_UP(tensor.buffer.size, PTO2_PACKED_OUTPUT_ALIGN);
         }
@@ -582,6 +557,16 @@

   CYCLE_COUNT_LAP_RECORD(g_orch_insert_cycle, AicpuPhaseId::ORCH_INSERT, local_id);

+  // === Batch-write task descriptor to GM (single cache line burst) ===
+  // Deferred from allocation phase to avoid scattered GM writes that get
+  // evicted by TensorMap lookup/insert cache pressure.
+  __builtin_prefetch(&task, 1, 1);
+  task.mixed_task_id = mixed_task_id;
+  task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIC)] = normalized.aic_kernel_id;
+  task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV0)] = normalized.aiv0_kernel_id;
+  task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV1)] = normalized.aiv1_kernel_id;
+  task.packed_buffer_base = local_packed_base;
+  task.packed_buffer_end = local_packed_end;

   // Prefetch producer slot_states and cur_slot_state (written at init but likely
   // evicted by lookup/insert/heap). param_copy below provides hide time.
@@ -657,6 +642,8 @@
     PTO2ResourceShape shape = pto2_active_mask_to_shape(active_mask);
     sched->ready_queues[static_cast<int32_t>(shape)].push(&cur_slot_state);
   }
+  // Record dep pool watermark in local slot state (used by tail reclamation)
+  cur_slot_state.dep_pool_mark = orch->rings[ring_id].dep_pool.top;
 #if PTO2_ORCH_PROFILING
   // Per producer: fetch_add(fanout_count) + load(task_state) + store(unlock) = 3 atomics
   // Lock atomics (loads + CAS) are counted inside pto2_fanout_lock
@@ -667,9 +654,6 @@
 #endif
   }

-  // Record dep pool watermark for this task (used by tail reclamation)
-  payload->dep_pool_mark = orch->rings[ring_id].dep_pool.top;
-
   CYCLE_COUNT_LAP_RECORD(g_orch_fanin_cycle, AicpuPhaseId::ORCH_FANIN, local_id);

 #if PTO2_PROFILING
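
The batch-write hunk above is a stage-then-burst pattern: descriptor fields accumulate in locals while the cache-heavy TensorMap and heap work runs, then land in GM back to back on a freshly prefetched line. A standalone sketch of the same idea, with a hypothetical Descriptor type and publish helper that are not from this codebase:

    #include <cstdint>

    // Hypothetical 64-byte descriptor in a slow, contended region (e.g. GM).
    struct alignas(64) Descriptor {
      uint64_t id;
      uint32_t kernel[3];
      void* buf_base;
      void* buf_end;
    };

    // Stage values in locals/registers first, then write the descriptor once,
    // so all stores hit a single prefetched cache line back to back.
    inline void publish(Descriptor& d, uint64_t id, const uint32_t (&k)[3],
                        void* base, void* end) {
      __builtin_prefetch(&d, 1, 1);  // rw=1 (write), moderate temporal locality
      d.id = id;
      d.kernel[0] = k[0];
      d.kernel[1] = k[1];
      d.kernel[2] = k[2];
      d.buf_base = base;
      d.buf_end = end;
    }

If the stores were interleaved with the TensorMap lookups instead, the target line could be evicted between writes and refetched from GM repeatedly, which is exactly what the deferral avoids.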

src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h

Lines changed: 3 additions & 5 deletions
@@ -42,11 +42,9 @@ struct PTO2OrchestratorState {
   // === PER-RING RESOURCES ===
   PTO2RingSet rings[PTO2_MAX_RING_DEPTH];
   PTO2DepListEntry* dep_pool_cur_entries[PTO2_MAX_RING_DEPTH];
-  int32_t dep_pool_last_reclaimed[PTO2_MAX_RING_DEPTH];

   // === TENSOR MAP (Private) ===
   PTO2TensorMap tensor_map;  // Producer lookup
-  int32_t tensormap_last_cleanup[PTO2_MAX_RING_DEPTH];

   // === SCOPE STACK (Private) ===
   // Single contiguous buffer of task IDs, partitioned by scope level.
@@ -88,10 +86,10 @@
    * Get current ring index from scope depth.
    * Maps scope depth to ring_id: min(scope_depth, PTO2_MAX_RING_DEPTH - 1)
    */
-  int32_t current_ring_id() const {
+  uint8_t current_ring_id() const {
     int32_t depth = scope_stack_top;
     if (depth < 0) depth = 0;
-    return depth < PTO2_MAX_RING_DEPTH ? depth : PTO2_MAX_RING_DEPTH - 1;
+    return depth < PTO2_MAX_RING_DEPTH ? static_cast<uint8_t>(depth) : PTO2_MAX_RING_DEPTH - 1;
   }

   /**
@@ -102,7 +100,7 @@ struct PTO2OrchestratorState {
       return NULL;
     }

-    int32_t rid = current_ring_id();
+    uint8_t rid = current_ring_id();
     void* buffer = rings[rid].heap_ring.pto2_heap_ring_alloc(total_size);

 #if PTO2_PROFILING
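
Since current_ring_id now returns uint8_t, the clamp it implements is worth spelling out. A self-contained check of the same mapping; kMaxRingDepth is an assumed stand-in for PTO2_MAX_RING_DEPTH, whose actual value is not shown in this diff:

    #include <cassert>
    #include <cstdint>

    constexpr int32_t kMaxRingDepth = 4;  // assumed; real constant is PTO2_MAX_RING_DEPTH

    // Mirrors current_ring_id(): ring = min(max(depth, 0), kMaxRingDepth - 1).
    uint8_t ring_for_depth(int32_t depth) {
      if (depth < 0) depth = 0;
      return depth < kMaxRingDepth ? static_cast<uint8_t>(depth)
                                   : static_cast<uint8_t>(kMaxRingDepth - 1);
    }

    int main() {
      assert(ring_for_depth(-1) == 0);  // empty scope stack -> ring 0
      assert(ring_for_depth(2) == 2);   // in range -> identity
      assert(ring_for_depth(7) == 3);   // deep nesting clamps to the last ring
      return 0;
    }

The uint8_t return type also removes the static_cast at every assignment site, since the slot state already stored ring_id as a single byte.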

src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp

Lines changed: 12 additions & 0 deletions
@@ -12,6 +12,7 @@
 #include <string.h>
 #include <stdlib.h>  // for exit()
 #include "common/unified_log.h"
+#include "pto_scheduler.h"

 // =============================================================================
 // Heap Ring Buffer Implementation
@@ -49,12 +50,23 @@ void pto2_dep_pool_init(PTO2DepListPool* pool, PTO2DepListEntry* base, int32_t c
   pool->top = 1;   // Start from 1, 0 means NULL/empty
   pool->tail = 1;  // Match initial top (no reclaimable entries yet)
   pool->high_water = 0;
+  pool->last_reclaimed = 0;

   // Initialize entry 0 as NULL marker
   pool->base[0].slot_state = nullptr;
   pool->base[0].next = nullptr;
 }

+void PTO2DepListPool::reclaim(PTO2SchedulerState* sched, uint8_t ring_id, int32_t sm_last_task_alive) {
+  if (sm_last_task_alive >= last_reclaimed + PTO2_DEP_POOL_CLEANUP_INTERVAL && sm_last_task_alive > 0) {
+    int32_t mark = sched->ring_sched_states[ring_id].get_slot_state_by_task_id(sm_last_task_alive - 1).dep_pool_mark;
+    if (mark > 0) {
+      advance_tail(mark);
+    }
+    last_reclaimed = sm_last_task_alive;
+  }
+}
+
 int32_t pto2_dep_pool_used(PTO2DepListPool* pool) {
   return pool->top - pool->tail;
 }
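
The watermark scheme behind reclaim is easiest to see with concrete numbers: dep_pool_mark records pool->top right after a task's dep-list entries were allocated, so every entry below the mark of the newest retired task (sm_last_task_alive - 1) is unreachable. An illustrative trace (entry counts invented, interval constant real):

    // Assuming PTO2_DEP_POOL_CLEANUP_INTERVAL == 64 and last_reclaimed == 0:
    //
    //   submit task 0:  allocates entries 1..3  -> slot_state[0].dep_pool_mark = 4
    //   submit task 1:  allocates entries 4..5  -> slot_state[1].dep_pool_mark = 6
    //   ...
    //   submit task 63: allocates entry 200     -> slot_state[63].dep_pool_mark = 201
    //
    //   scheduler publishes last_task_alive = 64 (tasks 0..63 retired)
    //   reclaim(sched, ring_id, 64):
    //     64 >= 0 + 64                 -> interval gate passes
    //     mark = slot_state[63].dep_pool_mark == 201
    //     advance_tail(201)            -> entries 1..200 become reusable
    //     last_reclaimed = 64
    //
    // Marks only grow with top, so repeated calls never move tail backwards.

Compared with the removed pto2_dep_pool_reclaim, the mark now comes from scheduler-local slot state instead of the GM task payload, and reclamation is batched by the new interval rather than attempted on every advance of last_task_alive.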

src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h

Lines changed: 13 additions & 0 deletions
@@ -32,6 +32,8 @@
 #include "pto_shared_memory.h"
 #include "common/unified_log.h"

+struct PTO2SchedulerState;  // Forward declaration for dep_pool reclaim
+
 // Set to 1 to enable periodic BLOCKED/Unblocked messages during spin-wait.
 #ifndef PTO2_SPIN_VERBOSE_LOGGING
 #define PTO2_SPIN_VERBOSE_LOGGING 1
@@ -468,10 +470,21 @@ struct PTO2DepListPool {
   int32_t top;         // Linear next-allocation counter (starts from 1)
   int32_t tail;        // Linear first-alive counter (entries before this are dead)
   int32_t high_water;  // Peak concurrent usage (top - tail)
+  int32_t last_reclaimed{0};  // last_task_alive at last successful reclamation

   // Error code pointer for fatal error reporting (→ sm_header->orch_error_code)
   std::atomic<int32_t>* error_code_ptr = nullptr;

+  /**
+   * Reclaim dead entries based on scheduler's slot state dep_pool_mark.
+   * Safe to call multiple times — only advances tail forward.
+   *
+   * @param sched Scheduler state (for reading slot dep_pool_mark)
+   * @param ring_id Ring layer index
+   * @param sm_last_task_alive Current last_task_alive from shared memory
+   */
+  void reclaim(PTO2SchedulerState* sched, uint8_t ring_id, int32_t sm_last_task_alive);
+
   /**
    * Allocate a single entry from the pool (single-thread per pool instance)
    *

src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h

Lines changed: 3 additions & 1 deletion
@@ -103,6 +103,7 @@

 // TensorMap cleanup interval
 #define PTO2_TENSORMAP_CLEANUP_INTERVAL 64  // Cleanup every N retired tasks
+#define PTO2_DEP_POOL_CLEANUP_INTERVAL 64   // Cleanup every N retired tasks

 // =============================================================================
 // Multi-Ring task_id Encoding
@@ -366,7 +367,7 @@ struct PTO2TaskPayload {
   int32_t tensor_count{0};
   int32_t scalar_count{0};
   int32_t fanin_actual_count{0};  // Actual fanin count (without the +1 redundance)
-  int32_t dep_pool_mark{0};  // Dep pool top after this task's submission (for reclamation)
+  int32_t _reserved{0};  // Reserved (dep_pool_mark moved to SlotState for local access)
   PTO2TaskSlotState* fanin_slot_states[PTO2_MAX_INPUTS];  // Producer slot states (used by on_task_release)
   // === Cache lines 3-34 (2048B) — tensors (alignas(64) forces alignment) ===
   Tensor tensors[PTO2_MAX_TENSOR_PARAMS];
@@ -425,6 +426,7 @@ struct alignas(64) PTO2TaskSlotState {
   uint8_t active_mask;                     // Bitmask of active subtask slots (set once)
   std::atomic<uint8_t> subtask_done_mask;  // Each subtask sets its done bit on completion
   uint8_t ring_id;                         // Ring layer this task belongs to (for per-ring reclamation)
+  int32_t dep_pool_mark{0};                // Dep pool top after this task's submission (orchestrator-only, local memory)
 };

 static_assert(sizeof(PTO2TaskSlotState) == 64);
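
Note the static_assert kept at the end of this hunk: PTO2TaskSlotState is alignas(64) and pinned to exactly one cache line, so the relocated dep_pool_mark must fit into previously unused padding rather than growing the struct. A compilable illustration of that guard (field list abbreviated; the real struct has more members than the hunk shows):

    #include <atomic>
    #include <cstdint>

    // Abbreviated stand-in for PTO2TaskSlotState; earlier members are elided.
    struct alignas(64) SlotStateSketch {
      void* task;                              // placeholder for preceding fields
      uint8_t active_mask;
      std::atomic<uint8_t> subtask_done_mask;
      uint8_t ring_id;
      int32_t dep_pool_mark{0};                // new field, absorbed by padding
    };

    // Compile-time guard: a field that overflows the cache line fails here.
    static_assert(sizeof(SlotStateSketch) == 64, "slot state must stay one cache line");

Keeping the mark here rather than in the GM-resident PTO2TaskPayload means the orchestrator reads it from local memory during reclamation instead of pulling in a payload cache line.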

src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp

Lines changed: 8 additions & 21 deletions
@@ -115,6 +115,7 @@ bool PTO2TensorMap::init(int32_t new_num_buckets, int32_t new_pool_size, const i

   for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
     last_task_alives[r] = 0;
+    last_cleanup[r] = 0;
   }

   return true;
@@ -220,27 +221,13 @@ int32_t PTO2TensorMap::valid_count() {
   return count;
 }

-void PTO2TensorMap::sync_tensormap() {
-  constexpr int MIN_FREE_NUM = 1024;
-  always_assert(orch != nullptr);
-  while(true) {
-    bool did_cleanup = false;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-      // Read current last_task_alive from shared memory for this ring
-      int32_t new_last_task_alive =
-          orch->sm_handle->header->rings[r].fc.last_task_alive.load(std::memory_order_acquire);
-      sync_validity(r, new_last_task_alive);
-      // Only attempt cleanup when last_task_alive has actually advanced;
-      // otherwise cleanup_retired would empty-loop and we'd spin forever.
-      if (new_last_task_alive <= orch->tensormap_last_cleanup[r]) continue;
-      if ((pool_size - next_entry_idx + free_num < MIN_FREE_NUM) ||
-          new_last_task_alive - orch->tensormap_last_cleanup[r] >= PTO2_TENSORMAP_CLEANUP_INTERVAL) {
-        cleanup_retired(r, orch->tensormap_last_cleanup[r], new_last_task_alive);
-        orch->tensormap_last_cleanup[r] = new_last_task_alive;
-        did_cleanup = true;
-      }
-    }
-    if (!did_cleanup) break;
+void PTO2TensorMap::sync_tensormap(uint8_t ring_id, int32_t sm_last_task_alive) {
+  sync_validity(ring_id, sm_last_task_alive);
+  // Only attempt cleanup when last_task_alive has actually advanced;
+  // otherwise cleanup_retired would empty-loop and we'd spin forever.
+  if (sm_last_task_alive - last_cleanup[ring_id] >= PTO2_TENSORMAP_CLEANUP_INTERVAL) {
+    cleanup_retired(ring_id, last_cleanup[ring_id], sm_last_task_alive);
+    last_cleanup[ring_id] = sm_last_task_alive;
   }
 }
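
With the MIN_FREE_NUM branch gone, the cleanup cadence is now purely interval driven: cleanup_retired runs at most once per PTO2_TENSORMAP_CLEANUP_INTERVAL retired tasks per ring. A trace of the gate using the interval's defined value of 64 (task counts invented):

    // last_cleanup[r] starts at 0:
    //
    //   submit sees sm_last_task_alive = 10   ->  10 - 0  < 64   sync_validity only
    //   submit sees sm_last_task_alive = 63   ->  63 - 0  < 64   sync_validity only
    //   submit sees sm_last_task_alive = 70   ->  70 - 0 >= 64   cleanup_retired(r, 0, 70),
    //                                                            last_cleanup[r] = 70
    //   submit sees sm_last_task_alive = 100  -> 100 - 70 < 64   sync_validity only

One consequence worth noting: the old code could also force a cleanup when the entry pool ran low; after this change, pressure relief relies on the fixed interval alone.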

src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h

Lines changed: 4 additions & 1 deletion
@@ -198,6 +198,9 @@ struct PTO2TensorMap {
   // Per-ring validity threshold (for lazy invalidation)
   int32_t last_task_alives[PTO2_MAX_RING_DEPTH];  // Cached from shared memory per ring

+  // Per-ring cleanup progress (for periodic cleanup_retired)
+  int32_t last_cleanup[PTO2_MAX_RING_DEPTH]{};
+
   PTO2OrchestratorState* orch{nullptr};

   // new_entry currently only allocates memory; it does not yet assign attributes
@@ -500,7 +503,7 @@
    * Called periodically to refresh the lazy invalidation threshold.
    * Also triggers cleanup if threshold has advanced significantly.
    */
-  void sync_tensormap();
+  void sync_tensormap(uint8_t ring_id, int32_t sm_last_task_alive);
 };

 #if PTO2_TENSORMAP_PROFILING
