From 2805c1140e0439b95696aa1dbf0d1e1219fed23f Mon Sep 17 00:00:00 2001
From: liaoheng <hengliao1972@163.com>
Date: Thu, 26 Feb 2026 12:15:27 +0800
Subject: [PATCH 1/6] Enhance: platform and runtime infrastructure for paged
 attention

- Add cache flush (dc cvac) for tensor_copies in orchestrator to ensure
  AICore sees correct tensor metadata via HBM
- Improve AICPU executor with cycle-accurate profiling, scheduler phase
  breakdown (dispatch/complete/scan/yield), and enhanced task statistics
- Extend memory allocator with larger heap support and alignment helpers
- Add platform config tuning for device runner and register access
---
 src/platform/a2a3/host/device_runner.cpp      |  60 ++-
 src/platform/a2a3/host/host_regs.cpp          |   7 +-
 src/platform/a2a3/host/memory_allocator.cpp   |  31 +-
 .../a2a3sim/host/memory_allocator.cpp         |   7 +
 src/platform/include/common/platform_config.h |   6 +-
 src/platform/include/host/memory_allocator.h  |   9 +
 .../aicpu/aicpu_executor.cpp                  | 492 +++++++++++++-----
 .../runtime/pto_orchestrator.cpp              |  52 +-
 .../runtime/pto_runtime2_types.h              |  22 +
 .../runtime/pto_types.h                       |   5 +-
 .../tensormap_and_ringbuffer/runtime/tensor.h |   8 +-
 11 files changed, 521 insertions(+), 178 deletions(-)

diff --git a/src/platform/a2a3/host/device_runner.cpp b/src/platform/a2a3/host/device_runner.cpp
index e05eaefcf..0fe992b1f 100644
--- a/src/platform/a2a3/host/device_runner.cpp
+++ b/src/platform/a2a3/host/device_runner.cpp
@@ -411,6 +411,9 @@ int DeviceRunner::run(Runtime& runtime,
     }
 
     std::cout << "\n=== rtStreamSynchronize stream_aicpu_===" << '\n';
+    std::cout << "(AICPU progress/heap logs go to device log, not here. If this hangs, check: "
+              << "grep -E 'PTO2|HeapRing' $HOME/ascend/log/debug/device-" << device_id_ << "/*.log)"
+              << std::endl;
     // Synchronize streams
     rc = rtStreamSynchronize(stream_aicpu_);
     if (rc != 0) {
@@ -460,6 +463,25 @@ int DeviceRunner::finalize() {
         return 0;
     }
 
+    // Ensure we are on the correct device before any rtFree (finalize may run from
+    // destructor after Python/runtime_maker has run; current device might have changed).
+    if (device_id_ >= 0) {
+        int set_rc = rtSetDevice(static_cast<uint32_t>(device_id_));
+        if (set_rc != 0) {
+            LOG_ERROR("rtSetDevice(%d) failed: %d (non-fatal)", device_id_, set_rc);
+        }
+    }
+
+    // Ensure all device work (including any async copies) is complete before freeing.
+    // This can avoid rtFree returning 507899 when device is still busy.
+    // CANN rtDeviceSynchronize() takes no arguments (syncs current device).
+    {
+        int sync_rc = rtDeviceSynchronize();
+        if (sync_rc != 0) {
+            LOG_ERROR("rtDeviceSynchronize failed: %d (non-fatal, continuing finalize)", sync_rc);
+        }
+    }
+
     // Print handshake results before cleanup (reads from device memory)
     print_handshake_results();
 
@@ -472,21 +494,10 @@ int DeviceRunner::finalize() {
     // Cleanup AICPU SO
     so_info_.finalize();
 
-    // Clear kernel address mapping
-    func_id_to_addr_.clear();
-    binaries_loaded_ = false;
-
-    // Destroy streams
-    if (stream_aicpu_ != nullptr) {
-        rtStreamDestroy(stream_aicpu_);
-        stream_aicpu_ = nullptr;
-    }
-    if (stream_aicore_ != nullptr) {
-        rtStreamDestroy(stream_aicore_);
-        stream_aicore_ = nullptr;
-    }
-
-    // Cleanup performance profiling
+    // Cleanup performance profiling and free all device memory *before* destroying
+    // streams. CANN rtFree can fail (e.g. 507899) if streams are destroyed first.
+    // After halHostUnregister, CANN may have already freed the perf buffer; calling
+    // rtFree on it causes 507899. So we pass a callback that only untracks the pointer.
     if (perf_collector_.is_initialized()) {
         auto unregister_cb = [](void* host_ptr, int device_id, void* user_data) -> int {
             (void)user_data;
@@ -499,15 +510,30 @@ int DeviceRunner::finalize() {
 
         auto free_cb = [](void* dev_ptr, void* user_data) -> int {
             auto* allocator = static_cast<MemoryAllocator*>(user_data);
-            return allocator->free(dev_ptr);
+            allocator->untrack(dev_ptr);
+            return 0;
         };
 
         perf_collector_.finalize(unregister_cb, free_cb, &mem_alloc_);
     }
 
-    // Free all remaining allocations (including handshake buffer and binGmAddr)
+    // Free all remaining allocations (kernel binaries, regs, etc.) before stream destroy
     mem_alloc_.finalize();
 
+    // Clear kernel address mapping (no longer valid after mem_alloc_.finalize())
+    func_id_to_addr_.clear();
+    binaries_loaded_ = false;
+
+    // Destroy streams after all device memory is freed
+    if (stream_aicpu_ != nullptr) {
+        rtStreamDestroy(stream_aicpu_);
+        stream_aicpu_ = nullptr;
+    }
+    if (stream_aicore_ != nullptr) {
+        rtStreamDestroy(stream_aicore_);
+        stream_aicore_ = nullptr;
+    }
+
     device_id_ = -1;
     worker_count_ = 0;
     aicore_kernel_binary_.clear();
diff --git a/src/platform/a2a3/host/host_regs.cpp b/src/platform/a2a3/host/host_regs.cpp
index 38b143bdd..b5f9abdbc 100644
--- a/src/platform/a2a3/host/host_regs.cpp
+++ b/src/platform/a2a3/host/host_regs.cpp
@@ -114,9 +114,10 @@ void get_aicore_regs(std::vector<int64_t>& regs, uint64_t device_id) {
     int rt = get_aicore_reg_info(aic, aiv, ADDR_MAP_TYPE_REG_AIC_CTRL, device_id);
 
     if (rt != 0) {
-        LOG_ERROR("get_aicore_reg_info failed, using placeholder addresses");
-        // Fallback: generate placeholder addresses
-        for (int i = 0; i < 25; i++) {
+        LOG_ERROR("get_aicore_reg_info failed (rc=%d), using placeholder addresses", rt);
+        LOG_WARN("Placeholder addresses are NOT valid AICore MMIO bases; AICore kernels will not run and the process may hang or never complete. Fix HAL/permissions and re-run.");
+        // Fallback: generate placeholder addresses (invalid for real execution)
+        for (int i = 0; i < DAV_2201::PLATFORM_MAX_PHYSICAL_CORES; i++) {
             aic.push_back(0xDEADBEEF00000000ULL + (i * 0x800000));  // 8M stride
             aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x100000);
             aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x200000);
diff --git a/src/platform/a2a3/host/memory_allocator.cpp b/src/platform/a2a3/host/memory_allocator.cpp
index 269ec63ee..cb5586bc3 100644
--- a/src/platform/a2a3/host/memory_allocator.cpp
+++ b/src/platform/a2a3/host/memory_allocator.cpp
@@ -26,6 +26,13 @@ void* MemoryAllocator::alloc(size_t size) {
     return ptr;
 }
 
+void MemoryAllocator::untrack(void* ptr) {
+    if (ptr == nullptr) {
+        return;
+    }
+    ptr_set_.erase(ptr);
+}
+
 int MemoryAllocator::free(void* ptr) {
     if (ptr == nullptr) {
         return 0;
@@ -38,10 +45,14 @@ int MemoryAllocator::free(void* ptr) {
         return 0;
     }
 
-    // Free the memory
+    // Free the memory. CANN may return 507899 during teardown (known quirk); log as warning.
     int rc = rtFree(ptr);
     if (rc != 0) {
-        LOG_ERROR("rtFree failed: %d", rc);
+        if (rc == 507899) {
+            LOG_WARN("rtFree returned 507899 (CANN teardown quirk, non-fatal): %d", rc);
+        } else {
+            LOG_ERROR("rtFree failed: %d", rc);
+        }
         return rc;
     }
 
@@ -58,17 +69,21 @@ int MemoryAllocator::finalize() {
 
     int last_error = 0;
 
-    // Free all remaining tracked pointers
-    for (void* ptr : ptr_set_) {
+    // Free all remaining tracked pointers. On rtFree failure (e.g. CANN 507899),
+    // still remove from set to avoid double-free; continue freeing others.
+    for (auto it = ptr_set_.begin(); it != ptr_set_.end(); ) {
+        void* ptr = *it;
         int rc = rtFree(ptr);
         if (rc != 0) {
-            LOG_ERROR("rtFree failed during Finalize: %d", rc);
+            if (rc == 507899) {
+                LOG_WARN("rtFree during Finalize returned 507899 (CANN teardown quirk, non-fatal): %d", rc);
+            } else {
+                LOG_ERROR("rtFree failed during Finalize: %d", rc);
+            }
             last_error = rc;
         }
+        it = ptr_set_.erase(it);
     }
-
-    // Clear the set
-    ptr_set_.clear();
     finalized_ = true;
 
     return last_error;
diff --git a/src/platform/a2a3sim/host/memory_allocator.cpp b/src/platform/a2a3sim/host/memory_allocator.cpp
index 310fd2c79..43fc2d501 100644
--- a/src/platform/a2a3sim/host/memory_allocator.cpp
+++ b/src/platform/a2a3sim/host/memory_allocator.cpp
@@ -25,6 +25,13 @@ void* MemoryAllocator::alloc(size_t size) {
     return ptr;
 }
 
+void MemoryAllocator::untrack(void* ptr) {
+    if (ptr == nullptr) {
+        return;
+    }
+    ptr_set_.erase(ptr);
+}
+
 int MemoryAllocator::free(void* ptr) {
     if (ptr == nullptr) {
         return 0;
diff --git a/src/platform/include/common/platform_config.h b/src/platform/include/common/platform_config.h
index 2909157a1..6bce70678 100644
--- a/src/platform/include/common/platform_config.h
+++ b/src/platform/include/common/platform_config.h
@@ -173,10 +173,12 @@ constexpr uint8_t PLATFORM_AICORE_BITMAP_LEN = 2;
 constexpr uint32_t PLATFORM_SUB_CORES_PER_AICORE = PLATFORM_CORES_PER_BLOCKDIM;
 
 /**
- * Maximum physical AICore count for DAV 2201 chip
+ * Maximum physical AICore count for DAV 2201 chip.
+ * MUST use 24 AIC + 48 AIV only. Do NOT use 25/50 (causes runtime failures).
  */
 namespace DAV_2201 {
-constexpr uint32_t PLATFORM_MAX_PHYSICAL_CORES = 25;
+constexpr uint32_t PLATFORM_MAX_PHYSICAL_CORES = 24;
+static_assert(PLATFORM_MAX_PHYSICAL_CORES == 24u, "Use 24 AIC + 48 AIV only; 25/50 is invalid");
 }
 
 #endif  // PLATFORM_COMMON_PLATFORM_CONFIG_H_
diff --git a/src/platform/include/host/memory_allocator.h b/src/platform/include/host/memory_allocator.h
index c14e2be61..d30459e95 100644
--- a/src/platform/include/host/memory_allocator.h
+++ b/src/platform/include/host/memory_allocator.h
@@ -53,6 +53,15 @@ class MemoryAllocator {
      */
     void* alloc(size_t size);
 
+    /**
+     * Remove pointer from tracking without freeing (e.g. after halHostUnregister
+     * which may have already freed the device memory; calling rtFree would fail with 507899).
+     *
+     * @param ptr  Memory pointer to remove from tracking
+     * @return 0 if removed, 0 if ptr not tracked (no-op)
+     */
+    void untrack(void* ptr);
+
     /**
      * Free memory if tracked
      *
diff --git a/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 760214d91..94fc473ba 100644
--- a/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -61,6 +61,8 @@ constexpr int MAX_CORES_PER_THREAD = MAX_AIC_PER_THREAD + MAX_AIV_PER_THREAD;
 // Maximum tasks for ready queue (PTO2 mode uses shared memory task count)
 constexpr int AICPU_MAX_READY_TASKS = 16384;
 constexpr int AICPU_READY_MASK = AICPU_MAX_READY_TASKS - 1;
+// 3 shards per type: each scheduler thread pushes to its own shard (thread_idx % 3), pops own first + work stealing
+constexpr int PTO2_READY_QUEUE_SHARDS = 3;
 
 // Lightweight spinlock (avoids futex syscall overhead of std::mutex)
 struct SpinLock {
@@ -95,17 +97,16 @@ struct AicpuExecutor {
     int aic_count_{0};
     int aiv_count_{0};
 
-    // ===== Task queue state (FIFO circular queue, aligned with host_build_graph) =====
-    // ===== Spinlock-based MPMC ready queues (lighter than std::mutex) =====
-    SpinLock ready_queue_aic_lock_;
-    int ready_queue_aic_[AICPU_MAX_READY_TASKS];
-    int ready_queue_aic_head_{0};
-    int ready_queue_aic_tail_{0};
+    // ===== 3 shards per type: push to own shard (thread_idx % 3), pop own first + work stealing =====
+    SpinLock ready_queue_aic_lock_[PTO2_READY_QUEUE_SHARDS];
+    int ready_queue_aic_[PTO2_READY_QUEUE_SHARDS][AICPU_MAX_READY_TASKS];
+    int ready_queue_aic_head_[PTO2_READY_QUEUE_SHARDS]{0};
+    int ready_queue_aic_tail_[PTO2_READY_QUEUE_SHARDS]{0};
 
-    SpinLock ready_queue_aiv_lock_;
-    int ready_queue_aiv_[AICPU_MAX_READY_TASKS];
-    int ready_queue_aiv_head_{0};
-    int ready_queue_aiv_tail_{0};
+    SpinLock ready_queue_aiv_lock_[PTO2_READY_QUEUE_SHARDS];
+    int ready_queue_aiv_[PTO2_READY_QUEUE_SHARDS][AICPU_MAX_READY_TASKS];
+    int ready_queue_aiv_head_[PTO2_READY_QUEUE_SHARDS]{0};
+    int ready_queue_aiv_tail_[PTO2_READY_QUEUE_SHARDS]{0};
 
     // Task execution tracking
     std::atomic<int> completed_tasks_{0};
@@ -302,10 +303,12 @@ int AicpuExecutor::init(Runtime* runtime) {
     orchestrator_done_.store(orch_on_host, std::memory_order_release);
 
     // Initial ready tasks will be populated from PTO2 shared memory in resolve_and_dispatch_pto2
-    ready_queue_aic_head_ = 0;
-    ready_queue_aic_tail_ = 0;
-    ready_queue_aiv_head_ = 0;
-    ready_queue_aiv_tail_ = 0;
+    for (int s = 0; s < PTO2_READY_QUEUE_SHARDS; s++) {
+        ready_queue_aic_head_[s] = 0;
+        ready_queue_aic_tail_[s] = 0;
+        ready_queue_aiv_head_[s] = 0;
+        ready_queue_aiv_tail_[s] = 0;
+    }
 
     // Reset per-core dispatch timestamps and task counters
     for (int i = 0; i < RUNTIME_MAX_WORKER; i++) {
@@ -430,8 +433,9 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx,
     int cur_thread_completed = 0;
     int cur_thread_tasks_in_flight = 0;
     int idle_iterations = 0;
-    const int MAX_IDLE_ITERATIONS = 50000000;
+    const int MAX_IDLE_ITERATIONS = 800000;  // ~20s idle then scheduler gives up (avoid long hang)
     const int WARN_INTERVAL = 1000000;
+    const int STALL_LOG_INTERVAL = 50000;  // DEV_ALWAYS every N idle iters to debug hang
     bool profiling_enabled = runtime->enable_profiling;
     int32_t last_reported_task_count = 0;
 
@@ -444,6 +448,12 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx,
     uint64_t sched_yield_cycle = 0;
     uint64_t sched_loop_count = 0;
     uint64_t sched_yield_count = 0;
+    uint64_t sched_scan_ready_wait = 0, sched_scan_ready_hold = 0;
+    uint64_t sched_orch_ready_wait = 0, sched_orch_ready_hold = 0;
+    uint64_t sched_complete_fanout_spin = 0, sched_complete_fanout_hold = 0;
+    uint64_t sched_complete_ready_wait = 0, sched_complete_ready_hold = 0;
+    uint64_t sched_dispatch_ready_wait = 0, sched_dispatch_ready_hold = 0;
+    uint64_t ready_pop_own = 0, ready_pop_steal = 0;
 #endif
     // Fanout traversal statistics
     uint64_t total_fanout_traversed = 0;
@@ -470,85 +480,9 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx,
 
         bool made_progress = false;
 
-        // Incremental scan: discover root tasks (fanin_count == 0)
-        {
-            int32_t visible = __atomic_load_n(&header->current_task_index, __ATOMIC_ACQUIRE);
-
-            // Update perf header total_tasks if visible tasks have changed
-            if (profiling_enabled && visible > 0 && visible != last_reported_task_count) {
-                perf_aicpu_update_total_tasks(runtime, static_cast<uint32_t>(visible));
-
-                DEV_INFO("Thread %d: Updated perf total_tasks to %d%s",
-                            thread_idx, visible, orch_done ? " (final)" : "");
-
-                last_reported_task_count = visible;
-            }
-
-            while (true) {
-                int32_t idx = next_scan_index_.load(std::memory_order_acquire);
-                if (idx >= visible) break;
-                if (!next_scan_index_.compare_exchange_weak(idx, idx + 1,
-                        std::memory_order_acq_rel, std::memory_order_acquire)) continue;
-
-                int32_t slot = idx & window_mask;
-
-                PTO2TaskDescriptor* t = &task_descriptors[slot];
-                int32_t fanin_count = __atomic_load_n(&t->fanin_count, __ATOMIC_ACQUIRE);
-                if (fanin_count == 0) {
-                    // Mark as enqueued (state=1) to prevent double-enqueue
-                    __atomic_store_n(&s_pto2_task_completed[slot], 1, __ATOMIC_RELEASE);
-                    int32_t wt = t->worker_type;
-                    if (wt == PTO2_WORKER_CUBE) {
-                        ready_queue_aic_lock_.lock();
-                        ready_queue_aic_[ready_queue_aic_tail_++ & AICPU_READY_MASK] = idx;
-                        ready_queue_aic_lock_.unlock();
-                    } else {
-                        ready_queue_aiv_lock_.lock();
-                        ready_queue_aiv_[ready_queue_aiv_tail_++ & AICPU_READY_MASK] = idx;
-                        ready_queue_aiv_lock_.unlock();
-                    }
-                    made_progress = true;
-                }
-            }
-        }
-        CYCLE_COUNT_LAP(sched_scan_cycle);
-
-
-        // Drain orchestrator ready queue: tasks made ready by orchestrator's early-return path
-        // (producer already completed → refcount incremented directly, consumer pushed to queue)
-        if (orch_ready_queue_ != nullptr) {
-            while (true) {
-                int32_t head = __atomic_load_n(orch_ready_head_, __ATOMIC_ACQUIRE);
-                int32_t tail = __atomic_load_n(orch_ready_tail_, __ATOMIC_ACQUIRE);
-                if (head == tail) break;  // queue empty
-
-                // CAS to claim this slot (multiple scheduler threads compete)
-                if (!__atomic_compare_exchange_n(orch_ready_head_, &head, head + 1,
-                        false, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)) continue;
-
-                int32_t task_id = orch_ready_queue_[head & (orch_ready_capacity_ - 1)];
-                int32_t slot = task_id & window_mask;
-
-                // CAS from 0 → 1 to claim enqueue rights (may already be enqueued by fanout path)
-                int32_t expected = 0;
-                if (!__atomic_compare_exchange_n(&s_pto2_task_completed[slot], &expected, 1,
-                        false, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)) continue;
-
-                PTO2TaskDescriptor* t = &task_descriptors[slot];
-                int32_t wt = t->worker_type;
-                if (wt == PTO2_WORKER_CUBE) {
-                    ready_queue_aic_lock_.lock();
-                    ready_queue_aic_[ready_queue_aic_tail_++ & AICPU_READY_MASK] = task_id;
-                    ready_queue_aic_lock_.unlock();
-                } else {
-                    ready_queue_aiv_lock_.lock();
-                    ready_queue_aiv_[ready_queue_aiv_tail_++ & AICPU_READY_MASK] = task_id;
-                    ready_queue_aiv_lock_.unlock();
-                }
-                made_progress = true;
-            }
-        }
-        CYCLE_COUNT_LAP(sched_orch_drain_cycle);
+        // Process completed and dispatch FIRST to minimize Sched (dispatch→finish) latency.
+        // Sched time = finish_ts - dispatch_ts; recording finish_ts here at loop start reduces
+        // tail overhead (time from AICore done to AICPU recording finish).
 
         // Phase 1: Process completed tasks (Handshake.task = PTO2DispatchPayload*)
         for (int i = 0; i < core_num; i++) {
@@ -579,13 +513,48 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx,
 
                 DEV_DEBUG("Thread %d: Core %d completed PTO2 task %d", thread_idx, core_id, task_id);
 
-                // Acquire fanout_lock, mark completed (state=2), snapshot fanout_head
-                while (PTO2_EXCHANGE(&pto2_task->fanout_lock, 1) != 0) { PTO2_SPIN_PAUSE_LIGHT(); }
+                // Mark completed (state=2), then snapshot fanout_head under the per-task spinlock.
+                //
+                // WHY THE LOCK IS REQUIRED (device orchestration / AICPU parallel mode):
+                // The orchestrator (Thread 3) runs concurrently with the scheduler threads and
+                // may still be adding consumers to this task's fanout list via
+                // pto2_add_consumer_to_producer().  That function holds fanout_lock while it
+                // (a) checks the completion state and (b) prepends to fanout_head.
+                //
+                // Without the lock here we have a TOCTOU race:
+                //   1. Orch acquires lock, checks state=0 (task still running), plans insert.
+                //   2. Task finishes; we store state=2 (RELEASE) but haven't acquired the lock.
+                //   3. Orch inserts consumer X into fanout_head, releases lock.
+                //   4. We read the OLD fanout_head (before X was inserted) → X is never woken.
+                //
+                // By acquiring the lock AFTER storing state=2 we guarantee mutual exclusion:
+                //   • If Orch holds the lock first  → it writes fanout_head → we read it with X.
+                //   • If we acquire the lock first  → Orch's subsequent lock-acquire sees state=2
+                //     via the release/acquire pair and takes the early-return path, directly
+                //     incrementing X's fanin_refcount instead of touching fanout_head.
+                // Either way every consumer is accounted for exactly once.
                 __atomic_store_n(&s_pto2_task_completed[task_id & window_mask], 2, __ATOMIC_RELEASE);
-                int32_t fanout_head = pto2_task->fanout_head;
-                PTO2_STORE_RELEASE(&pto2_task->fanout_lock, 0);
-
-                // Traverse fanout outside lock
+                pto2_fanout_lock(pto2_task);
+                int32_t fanout_head = (int32_t)pto2_task->fanout_head;
+                pto2_fanout_unlock(pto2_task);
+
+                // Traverse fanout (no lock)
+                //
+                // SEQ_CST on the refcount increment and fanin_count load breaks the IRIW
+                // (Independent Reads of Independent Writes) hazard with the orchestrator's
+                // Step 5 / Step 5b:
+                //
+                //   Thread 0 (here):           Thread 3 (orchestrator Step 5/5b):
+                //     fetch_add(refcount, SEQ_CST)   store(fanin_count=N, SEQ_CST)
+                //     load(fanin_count,  SEQ_CST)    load(refcount,       SEQ_CST)
+                //
+                // On ARM (IRIW is architecturally allowed with ACQ/REL), both threads could
+                // simultaneously read stale values — this thread sees fanin_count=0 and Step 5b
+                // sees refcount<N — leaving the consumer stuck forever.
+                //
+                // With SEQ_CST, C++ guarantees a single total order over all SEQ_CST ops.
+                // In any ordering the two writes fall, one of the two reads will observe the
+                // other thread's write, ensuring the consumer is enqueued exactly once.
                 int32_t fanout_len = 0;
                 int32_t current = fanout_head;
                 while (current > 0) {
@@ -593,21 +562,36 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx,
                     PTO2DepListEntry* entry = &dep_list_pool[current];
                     int32_t consumer_id = entry->task_id;
                     int32_t consumer_slot = consumer_id & window_mask;
-                    int prev = __atomic_fetch_add(&s_pto2_fanin_refcount[consumer_slot], 1, __ATOMIC_ACQ_REL);
+                    int prev = __atomic_fetch_add(&s_pto2_fanin_refcount[consumer_slot], 1, __ATOMIC_SEQ_CST);
                     PTO2TaskDescriptor* consumer_desc = &task_descriptors[consumer_slot];
-                    int32_t fanin_count = __atomic_load_n(&consumer_desc->fanin_count, __ATOMIC_ACQUIRE);
+                    int32_t fanin_count = __atomic_load_n(&consumer_desc->fanin_count, __ATOMIC_SEQ_CST);
                     if (prev + 1 == fanin_count) {
                         __atomic_store_n(&s_pto2_task_completed[consumer_slot], 1, __ATOMIC_RELEASE);
                         int32_t wt = consumer_desc->worker_type;
+                        int my_shard = thread_idx % PTO2_READY_QUEUE_SHARDS;
+#if PTO2_ORCH_PROFILING
+                        uint64_t _l0 = get_sys_cnt_aicpu(), _l1, _l2;
+#endif
                         if (wt == PTO2_WORKER_CUBE) {
-                            ready_queue_aic_lock_.lock();
-                            ready_queue_aic_[ready_queue_aic_tail_++ & AICPU_READY_MASK] = consumer_id;
-                            ready_queue_aic_lock_.unlock();
+                            ready_queue_aic_lock_[my_shard].lock();
+#if PTO2_ORCH_PROFILING
+                            _l1 = get_sys_cnt_aicpu();
+#endif
+                            ready_queue_aic_[my_shard][ready_queue_aic_tail_[my_shard]++ & AICPU_READY_MASK] = consumer_id;
+                            ready_queue_aic_lock_[my_shard].unlock();
                         } else {
-                            ready_queue_aiv_lock_.lock();
-                            ready_queue_aiv_[ready_queue_aiv_tail_++ & AICPU_READY_MASK] = consumer_id;
-                            ready_queue_aiv_lock_.unlock();
+                            ready_queue_aiv_lock_[my_shard].lock();
+#if PTO2_ORCH_PROFILING
+                            _l1 = get_sys_cnt_aicpu();
+#endif
+                            ready_queue_aiv_[my_shard][ready_queue_aiv_tail_[my_shard]++ & AICPU_READY_MASK] = consumer_id;
+                            ready_queue_aiv_lock_[my_shard].unlock();
                         }
+#if PTO2_ORCH_PROFILING
+                        _l2 = get_sys_cnt_aicpu();
+                        sched_complete_ready_wait += (_l1 - _l0);
+                        sched_complete_ready_hold += (_l2 - _l1);
+#endif
                     }
                     current = entry->next_offset;
                 }
@@ -618,6 +602,14 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx,
                 cur_thread_completed++;
                 made_progress = true;
                 completed_tasks_.fetch_add(1, std::memory_order_release);
+                // Debug: periodic progress (thread 0 only) to find which task hangs
+                if (thread_idx == 0 && task_count > 0) {
+                    int32_t c = completed_tasks_.load(std::memory_order_acquire);
+                    if (c <= 10 || c % 25 == 0 || c == task_count) {
+                        DEV_ALWAYS("PTO2 progress: completed=%d total=%d last_task_id=%d (%.1f%%)",
+                                  c, task_count, task_id, task_count > 0 ? 100.0 * c / task_count : 0.0);
+                    }
+                }
             }
         }
         CYCLE_COUNT_LAP(sched_complete_cycle);
@@ -629,19 +621,55 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx,
                 Handshake* h = &hank[core_id];
                 if (h->task_status == 0 && h->task == 0) {
                     int32_t task_id = -1;
+#if PTO2_ORCH_PROFILING
+                    int this_pop_steal = -1;
+                    uint64_t _l0 = get_sys_cnt_aicpu(), _l1 = _l0, _l2 = _l0;
+#endif
+                    int my_shard = thread_idx % PTO2_READY_QUEUE_SHARDS;
                     if (h->core_type == CoreType::AIC) {
-                        ready_queue_aic_lock_.lock();
-                        if (ready_queue_aic_head_ < ready_queue_aic_tail_) {
-                            task_id = ready_queue_aic_[ready_queue_aic_head_++ & AICPU_READY_MASK];
+                        for (int k = 0; k < PTO2_READY_QUEUE_SHARDS && task_id < 0; k++) {
+                            int shard = (my_shard + k) % PTO2_READY_QUEUE_SHARDS;
+                            ready_queue_aic_lock_[shard].lock();
+                            if (ready_queue_aic_head_[shard] < ready_queue_aic_tail_[shard]) {
+#if PTO2_ORCH_PROFILING
+                                _l1 = get_sys_cnt_aicpu();
+                                this_pop_steal = (k != 0);
+#endif
+                                task_id = ready_queue_aic_[shard][ready_queue_aic_head_[shard]++ & AICPU_READY_MASK];
+                                ready_queue_aic_lock_[shard].unlock();
+#if PTO2_ORCH_PROFILING
+                                _l2 = get_sys_cnt_aicpu();
+#endif
+                                break;
+                            }
+                            ready_queue_aic_lock_[shard].unlock();
                         }
-                        ready_queue_aic_lock_.unlock();
                     } else {
-                        ready_queue_aiv_lock_.lock();
-                        if (ready_queue_aiv_head_ < ready_queue_aiv_tail_) {
-                            task_id = ready_queue_aiv_[ready_queue_aiv_head_++ & AICPU_READY_MASK];
+                        for (int k = 0; k < PTO2_READY_QUEUE_SHARDS && task_id < 0; k++) {
+                            int shard = (my_shard + k) % PTO2_READY_QUEUE_SHARDS;
+                            ready_queue_aiv_lock_[shard].lock();
+                            if (ready_queue_aiv_head_[shard] < ready_queue_aiv_tail_[shard]) {
+#if PTO2_ORCH_PROFILING
+                                _l1 = get_sys_cnt_aicpu();
+                                this_pop_steal = (k != 0);
+#endif
+                                task_id = ready_queue_aiv_[shard][ready_queue_aiv_head_[shard]++ & AICPU_READY_MASK];
+                                ready_queue_aiv_lock_[shard].unlock();
+#if PTO2_ORCH_PROFILING
+                                _l2 = get_sys_cnt_aicpu();
+#endif
+                                break;
+                            }
+                            ready_queue_aiv_lock_[shard].unlock();
                         }
-                        ready_queue_aiv_lock_.unlock();
                     }
+#if PTO2_ORCH_PROFILING
+                    sched_dispatch_ready_wait += (_l1 - _l0);
+                    sched_dispatch_ready_hold += (_l2 - _l1);
+                    if (task_id >= 0 && this_pop_steal >= 0) {
+                        if (this_pop_steal) ready_pop_steal++; else ready_pop_own++;
+                    }
+#endif
                     if (task_id >= 0) {
                         PTO2TaskDescriptor* task = &task_descriptors[task_id & window_mask];
                         PTO2DispatchPayload* payload = &s_pto2_payload_per_core[core_id];
@@ -657,6 +685,36 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx,
                             core_dispatch_counts_[core_id]++;
                         }
                         h->task_status = 1;
+#ifdef __aarch64__
+                        // AICPU and AICore have separate, non-coherent cache hierarchies.
+                        // AICPU's writes sit in AICPU's cluster L1/L2 cache (write-back)
+                        // until explicitly flushed to HBM. AICore's dcci only invalidates
+                        // AICore's own cache and reads from HBM, so it sees stale values
+                        // if AICPU never flushed.
+                        //
+                        // Three regions must be flushed:
+                        //   1. task->tensor_copies[]: Tensor structs written by Thread 3
+                        //      (orchestrator) during pto2_submit_task. For recently-submitted
+                        //      tasks (last few batches), these writes are still "hot" in
+                        //      AICPU's cache. AICore reads them via payload->args[i] pointer
+                        //      to get buffer.addr/start_offset; stale HBM = addr 0 = hang.
+                        //   2. PTO2DispatchPayload: written by build_pto2_payload just above.
+                        //   3. Handshake: written by h->task = payload and h->task_status = 1.
+                        //      Use dc civac (clean+invalidate) so Phase 1 re-reads from HBM
+                        //      after AICore writes task_status=0 on completion.
+                        {
+                            // Flush PTO2DispatchPayload (build_pto2_payload writes) to HBM
+                            uintptr_t p0 = (uintptr_t)payload & ~63ULL;
+                            uintptr_t p1 = (uintptr_t)payload + sizeof(PTO2DispatchPayload);
+                            for (uintptr_t addr = p0; addr < p1; addr += 64) {
+                                __asm__ volatile("dc cvac, %0" :: "r"(addr) : "memory");
+                            }
+                            // Flush+Invalidate Handshake so Phase 1 reads come from HBM
+                            __asm__ volatile("dc civac, %0" :: "r"((uintptr_t)h) : "memory");
+                            // Wait for all cache ops to complete before returning
+                            __asm__ volatile("dsb sy" ::: "memory");
+                        }
+#endif
                         cur_thread_tasks_in_flight++;
                         made_progress = true;
                         DEV_DEBUG("Thread %d: Dispatching PTO2 task %d to core %d", thread_idx, task_id, core_id);
@@ -668,8 +726,172 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx,
         }
         CYCLE_COUNT_LAP(sched_dispatch_cycle);
 
+        // Incremental scan: discover root tasks (fanin_count == 0)
+        {
+            int32_t visible = __atomic_load_n(&header->current_task_index, __ATOMIC_ACQUIRE);
+
+            // Update perf header total_tasks if visible tasks have changed
+            if (profiling_enabled && visible > 0 && visible != last_reported_task_count) {
+                perf_aicpu_update_total_tasks(runtime, static_cast<uint32_t>(visible));
+
+                DEV_INFO("Thread %d: Updated perf total_tasks to %d%s",
+                            thread_idx, visible, orch_done ? " (final)" : "");
+
+                last_reported_task_count = visible;
+            }
+
+            while (true) {
+                int32_t idx = next_scan_index_.load(std::memory_order_acquire);
+                if (idx >= visible) break;
+                if (!next_scan_index_.compare_exchange_weak(idx, idx + 1,
+                        std::memory_order_acq_rel, std::memory_order_acquire)) continue;
+
+                int32_t slot = idx & window_mask;
+
+                PTO2TaskDescriptor* t = &task_descriptors[slot];
+                int32_t fanin_count = __atomic_load_n(&t->fanin_count, __ATOMIC_ACQUIRE);
+                if (fanin_count == 0) {
+                    // Mark as enqueued (state=1) to prevent double-enqueue
+                    __atomic_store_n(&s_pto2_task_completed[slot], 1, __ATOMIC_RELEASE);
+                    int32_t wt = t->worker_type;
+                    int my_shard = thread_idx % PTO2_READY_QUEUE_SHARDS;
+#if PTO2_ORCH_PROFILING
+                    uint64_t _l0 = get_sys_cnt_aicpu(), _l1, _l2;
+#endif
+                    if (wt == PTO2_WORKER_CUBE) {
+                        ready_queue_aic_lock_[my_shard].lock();
+#if PTO2_ORCH_PROFILING
+                        _l1 = get_sys_cnt_aicpu();
+#endif
+                        ready_queue_aic_[my_shard][ready_queue_aic_tail_[my_shard]++ & AICPU_READY_MASK] = idx;
+                        ready_queue_aic_lock_[my_shard].unlock();
+                    } else {
+                        ready_queue_aiv_lock_[my_shard].lock();
+#if PTO2_ORCH_PROFILING
+                        _l1 = get_sys_cnt_aicpu();
+#endif
+                        ready_queue_aiv_[my_shard][ready_queue_aiv_tail_[my_shard]++ & AICPU_READY_MASK] = idx;
+                        ready_queue_aiv_lock_[my_shard].unlock();
+                    }
+#if PTO2_ORCH_PROFILING
+                    _l2 = get_sys_cnt_aicpu();
+                    sched_scan_ready_wait += (_l1 - _l0);
+                    sched_scan_ready_hold += (_l2 - _l1);
+#endif
+                    made_progress = true;
+                }
+            }
+        }
+        CYCLE_COUNT_LAP(sched_scan_cycle);
+
+        // Drain orchestrator ready queue: tasks made ready by orchestrator's early-return path
+        // (producer already completed → refcount incremented directly, consumer pushed to queue)
+        if (orch_ready_queue_ != nullptr) {
+            while (true) {
+                int32_t head = __atomic_load_n(orch_ready_head_, __ATOMIC_ACQUIRE);
+                int32_t tail = __atomic_load_n(orch_ready_tail_, __ATOMIC_ACQUIRE);
+                if (head == tail) break;  // queue empty
+
+                // CAS to claim this slot (multiple scheduler threads compete)
+                if (!__atomic_compare_exchange_n(orch_ready_head_, &head, head + 1,
+                        false, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)) continue;
+
+                int32_t task_id = orch_ready_queue_[head & (orch_ready_capacity_ - 1)];
+                int32_t slot = task_id & window_mask;
+
+                // CAS from 0 → 1 to claim enqueue rights (may already be enqueued by fanout path)
+                int32_t expected = 0;
+                if (!__atomic_compare_exchange_n(&s_pto2_task_completed[slot], &expected, 1,
+                        false, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)) continue;
+
+                PTO2TaskDescriptor* t = &task_descriptors[slot];
+                int32_t wt = t->worker_type;
+                int my_shard = thread_idx % PTO2_READY_QUEUE_SHARDS;
+#if PTO2_ORCH_PROFILING
+                uint64_t _l0 = get_sys_cnt_aicpu(), _l1, _l2;
+#endif
+                if (wt == PTO2_WORKER_CUBE) {
+                    ready_queue_aic_lock_[my_shard].lock();
+#if PTO2_ORCH_PROFILING
+                    _l1 = get_sys_cnt_aicpu();
+#endif
+                    ready_queue_aic_[my_shard][ready_queue_aic_tail_[my_shard]++ & AICPU_READY_MASK] = task_id;
+                    ready_queue_aic_lock_[my_shard].unlock();
+                } else {
+                    ready_queue_aiv_lock_[my_shard].lock();
+#if PTO2_ORCH_PROFILING
+                    _l1 = get_sys_cnt_aicpu();
+#endif
+                    ready_queue_aiv_[my_shard][ready_queue_aiv_tail_[my_shard]++ & AICPU_READY_MASK] = task_id;
+                    ready_queue_aiv_lock_[my_shard].unlock();
+                }
+#if PTO2_ORCH_PROFILING
+                _l2 = get_sys_cnt_aicpu();
+                sched_orch_ready_wait += (_l1 - _l0);
+                sched_orch_ready_hold += (_l2 - _l1);
+#endif
+                made_progress = true;
+            }
+        }
+        CYCLE_COUNT_LAP(sched_orch_drain_cycle);
+
         if (!made_progress) {
             idle_iterations++;
+            if (thread_idx == 0 && task_count > 0 && idle_iterations % STALL_LOG_INTERVAL == 0 && idle_iterations <= WARN_INTERVAL) {
+                int32_t c = completed_tasks_.load(std::memory_order_acquire);
+                DEV_ALWAYS("PTO2 stall: no progress for %d iterations, completed=%d total=%d",
+                           idle_iterations, c, task_count);
+                // Scan all task slots to find truly stuck tasks
+                // state=0: not yet completed (may be waiting for deps or ready but not enqueued)
+                // state=1: enqueued in ready queue or dispatched to hardware
+                // state=2: completed by Phase 1
+                static const char* knames[] = {"QK","SOFTMAX_PREPARE","PV","ONLINE_UPDATE","AIC_HUB","AIV_HUB"};
+                int cnt_ready = 0, cnt_waiting = 0, cnt_inflight = 0;
+                for (int si = 0; si < task_count; si++) {
+                    int32_t st  = __atomic_load_n(&s_pto2_task_completed[si], __ATOMIC_SEQ_CST);
+                    int32_t rc  = __atomic_load_n(&s_pto2_fanin_refcount[si],  __ATOMIC_SEQ_CST);
+                    int32_t fi  = __atomic_load_n(&task_descriptors[si].fanin_count, __ATOMIC_SEQ_CST);
+                    int32_t kid = task_descriptors[si].kernel_id;
+                    const char* kn = (kid >= 0 && kid <= 5) ? knames[kid] : "?";
+                    if (st == 2) continue; // Already done
+                    if (st == 1) { cnt_inflight++; continue; }
+                    // st == 0
+                    if (rc >= fi) {
+                        // Ready (all deps satisfied) but not enqueued — this is the real bug
+                        cnt_ready++;
+                        if (cnt_ready <= 8) {
+                            DEV_ALWAYS("  STUCK-READY  slot=%d kernel=%s refcount=%d fanin=%d",
+                                       si, kn, rc, fi);
+                        }
+                    } else {
+                        cnt_waiting++;
+                        if (cnt_waiting <= 4) {
+                            DEV_ALWAYS("  STUCK-WAIT   slot=%d kernel=%s refcount=%d fanin=%d",
+                                       si, kn, rc, fi);
+                        }
+                    }
+                }
+                DEV_ALWAYS("  scan result: stuck_ready=%d stuck_waiting=%d in_flight=%d",
+                           cnt_ready, cnt_waiting, cnt_inflight);
+                // Log this thread's dispatch state
+                DEV_ALWAYS("  thread=%d cur_in_flight=%d core_num=%d",
+                           thread_idx, cur_thread_tasks_in_flight, core_num);
+                for (int ci = 0; ci < core_num && ci < 8; ci++) {
+                    int cid = cur_thread_cores[ci];
+                    Handshake* hh = &hank[cid];
+                    int32_t hw_task_id = -1;
+                    int32_t hw_kernel = -1;
+                    if (hh->task != 0) {
+                        const PTO2DispatchPayload* pl = reinterpret_cast<const PTO2DispatchPayload*>((uintptr_t)hh->task);
+                        hw_task_id = pl->task_id;
+                        hw_kernel  = pl->kernel_id;
+                    }
+                    const char* hkn = (hw_kernel >= 0 && hw_kernel <= 5) ?
+                        knames[hw_kernel] : "none";
+                    DEV_ALWAYS("    core=%d status=%d task_id=%d kernel=%s",
+                               cid, (int)hh->task_status, hw_task_id, hkn);
+                }
+            }
             if (idle_iterations % WARN_INTERVAL == 0) {
                 DEV_WARN("Thread %d: PTO2 %d idle iterations, %d/%d completed",
                         thread_idx, idle_iterations, completed_tasks_.load(std::memory_order_acquire), task_count);
@@ -720,6 +942,23 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx,
         (unsigned long long)total_fanout_traversed,
         max_fanout_len,
         cur_thread_completed > 0 ? (double)total_fanout_traversed / cur_thread_completed : 0.0);
+    DEV_ALWAYS("Thread %d:   lock(ready_q): wait=%.0fus hold=%.0fus (scan=%.0f/%.0f orch=%.0f/%.0f complete=%.0f/%.0f dispatch=%.0f/%.0f)",
+        thread_idx,
+        (double)cycles_to_us(sched_scan_ready_wait + sched_orch_ready_wait + sched_complete_ready_wait + sched_dispatch_ready_wait),
+        (double)cycles_to_us(sched_scan_ready_hold + sched_orch_ready_hold + sched_complete_ready_hold + sched_dispatch_ready_hold),
+        (double)cycles_to_us(sched_scan_ready_wait), (double)cycles_to_us(sched_scan_ready_hold),
+        (double)cycles_to_us(sched_orch_ready_wait), (double)cycles_to_us(sched_orch_ready_hold),
+        (double)cycles_to_us(sched_complete_ready_wait), (double)cycles_to_us(sched_complete_ready_hold),
+        (double)cycles_to_us(sched_dispatch_ready_wait), (double)cycles_to_us(sched_dispatch_ready_hold));
+    DEV_ALWAYS("Thread %d:   ready_q pop: own=%llu steal=%llu total=%llu steal_pct=%.1f%%",
+        thread_idx,
+        (unsigned long long)ready_pop_own, (unsigned long long)ready_pop_steal,
+        (unsigned long long)(ready_pop_own + ready_pop_steal),
+        (ready_pop_own + ready_pop_steal) > 0 ? 100.0 * (double)ready_pop_steal / (double)(ready_pop_own + ready_pop_steal) : 0.0);
+    DEV_ALWAYS("Thread %d:   lock(fanout): spin=%.0fus hold=%.0fus",
+        thread_idx,
+        (double)cycles_to_us(sched_complete_fanout_spin),
+        (double)cycles_to_us(sched_complete_fanout_hold));
 
     DEV_ALWAYS("Thread %d: PTO2 execution complete, completed %d tasks", thread_idx, cur_thread_completed);
 #endif
@@ -950,7 +1189,7 @@ int AicpuExecutor::run(Runtime* runtime) {
             // Device mode: task count lives in PTO2 shared memory (current_task_index at offset 0)
             void* sm = runtime->get_pto2_gm_sm_ptr();
             int32_t pto2_task_count = sm ? *(volatile int32_t*)sm : 0;
-            DEV_INFO("Thread 3: PTO2 task count = %d", pto2_task_count);
+            DEV_ALWAYS("PTO2 total submitted tasks = %d", pto2_task_count);
             total_tasks_.store(pto2_task_count, std::memory_order_release);
             orchestrator_done_.store(true, std::memory_order_release);
             DEV_INFO("Thread 3: Set orchestrator_done=true");
@@ -983,10 +1222,12 @@ int AicpuExecutor::run(Runtime* runtime) {
 
 void AicpuExecutor::deinit() {
     // Cleanup runtime execution state
-    ready_queue_aic_head_ = 0;
-    ready_queue_aic_tail_ = 0;
-    ready_queue_aiv_head_ = 0;
-    ready_queue_aiv_tail_ = 0;
+    for (int s = 0; s < PTO2_READY_QUEUE_SHARDS; s++) {
+        ready_queue_aic_head_[s] = 0;
+        ready_queue_aic_tail_[s] = 0;
+        ready_queue_aiv_head_[s] = 0;
+        ready_queue_aiv_tail_[s] = 0;
+    }
 
     // Reset per-core dispatch timestamps and task counters
     for (int i = 0; i < RUNTIME_MAX_WORKER; i++) {
@@ -1029,9 +1270,12 @@ void AicpuExecutor::diagnose_stuck_state(Runtime* runtime, int thread_idx,
     DEV_ALWAYS("Progress: %d/%d tasks (%.1f%%)",
              completed, total, total > 0 ? completed * 100.0 / total : 0.0);
 
-    int aic_ready = ready_queue_aic_tail_ - ready_queue_aic_head_;
-    int aiv_ready = ready_queue_aiv_tail_ - ready_queue_aiv_head_;
-    DEV_ALWAYS("Ready Queues: AIC=%d, AIV=%d", aic_ready, aiv_ready);
+    int aic_ready = 0, aiv_ready = 0;
+    for (int s = 0; s < PTO2_READY_QUEUE_SHARDS; s++) {
+        aic_ready += ready_queue_aic_tail_[s] - ready_queue_aic_head_[s];
+        aiv_ready += ready_queue_aiv_tail_[s] - ready_queue_aiv_head_[s];
+    }
+    DEV_ALWAYS("Ready Queues (3 shards, per-thread push + work-steal pop): AIC=%d, AIV=%d", aic_ready, aiv_ready);
 
     int busy_cores = 0;
     int idle_cores = 0;
diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index 2d211c838..0fa867ff7 100644
--- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -43,22 +43,11 @@ static int64_t  g_orch_submit_count = 0;
 #endif
 
 // =============================================================================
-// Per-Task Spinlock Implementation
+// Per-Task Spinlock Implementation (thin wrappers around the header helpers)
 // =============================================================================
 
-/**
- * Acquire spinlock for task's fanout fields
- */
-static inline void task_fanout_lock(PTO2TaskDescriptor* task) {
-    while (PTO2_EXCHANGE(&task->fanout_lock, 1) != 0) {
-        PTO2_SPIN_PAUSE_LIGHT();
-    }
-}
-
-/**
- * Release spinlock for task's fanout fields
- */
-static inline void task_fanout_unlock(PTO2TaskDescriptor* task) { PTO2_STORE_RELEASE(&task->fanout_lock, 0); }
+static inline void task_fanout_lock(PTO2TaskDescriptor* task)   { pto2_fanout_lock(task);   }
+static inline void task_fanout_unlock(PTO2TaskDescriptor* task) { pto2_fanout_unlock(task); }
 
 // =============================================================================
 // Orchestrator Initialization
@@ -300,9 +289,11 @@ void pto2_submit_task(PTO2OrchestratorState* orch,
     int32_t fanin_count = 0;
 
     task->param_count = num_params;
-    // Bulk copy all params at once
+    // Bulk copy param descriptors (type, tensor pointer, scalar); no tensor buffer content is copied.
     memcpy(task->params, params, num_params * sizeof(PTOParam));
-    // Copy tensor data into task-owned storage; redirect pointers
+    // Copy only Tensor *descriptors* (metadata: addr, size, strides, shape) into task-owned storage;
+    // redirect task->params[i].tensor to point to task->tensor_copies[i]. No allocation here;
+    // output buffer allocation happens in Step 3, and we write back buffer.addr to the caller's tensor.
     for (int i = 0; i < num_params; i++) {
         if (params[i].tensor) {
             task->tensor_copies[i] = *params[i].tensor;
@@ -362,6 +353,8 @@ void pto2_submit_task(PTO2OrchestratorState* orch,
             }
 
             case PTOParamType::OUTPUT: {
+                // OUTPUT must have a non-null tensor (descriptor for shape/size); no allocation in make_tensor.
+                assert(params[i].tensor && "OUTPUT param must have a non-NULL tensor descriptor");
                 // Only allocate from ring buffer when caller did not provide an address
                 if (params[i].tensor->buffer.addr == 0) {
                     total_output_size += PTO2_ALIGN_UP(params[i].tensor->buffer.size, PTO2_PACKED_OUTPUT_ALIGN);
@@ -382,13 +375,14 @@ void pto2_submit_task(PTO2OrchestratorState* orch,
         task->packed_buffer_end = (char*)task->packed_buffer_base + total_output_size;
 
         // Offsets: each output at 1024B-aligned slot; slot size = ALIGN_UP(size, 1024)
+        // Allocation happens here only; no memcpy of buffer content. Caller's tensor gets addr written back.
         int32_t offset = 0;
         for (int i = 0; i < task->param_count; i++) {
             if (task->params[i].type == PTOParamType::OUTPUT) {
                 if (task->tensor_copies[i].buffer.addr == 0) {
                     uint64_t alloc_addr = reinterpret_cast<uint64_t>((char*)task->packed_buffer_base + offset);
                     task->tensor_copies[i].buffer.addr = alloc_addr;
-                    // Write back through caller's pointer (implicit update)
+                    // Write back to caller's tensor so orchestration stack sees the allocated address (no copy)
                     params[i].tensor->buffer.addr = alloc_addr;
                     offset += PTO2_ALIGN_UP(task->tensor_copies[i].buffer.size, PTO2_PACKED_OUTPUT_ALIGN);
                 }
@@ -414,13 +408,31 @@ void pto2_submit_task(PTO2OrchestratorState* orch,
 
     CYCLE_COUNT_LAP(g_orch_insert_cycle);
 
+#ifdef __aarch64__
+    // Flush tensor_copies[] to HBM so AICore (which reads from HBM via dcci)
+    // sees correct Tensor metadata (buffer.addr, start_offset, strides, repeats).
+    // Done here in the orchestrator (Thread 3) rather than in the scheduler's
+    // dispatch path to avoid inflating Tail OH and triggering timing-dependent
+    // dependency resolution races.
+    {
+        uintptr_t tc0 = (uintptr_t)task->tensor_copies & ~63ULL;
+        uintptr_t tc1 = (uintptr_t)(task->tensor_copies + num_params);
+        for (uintptr_t addr = tc0; addr < tc1; addr += 64) {
+            __asm__ volatile("dc cvac, %0" :: "r"(addr) : "memory");
+        }
+        __asm__ volatile("dsb sy" ::: "memory");
+    }
+#endif
+
     // === STEP 5: Finalize fanin list ===
     // First build the fanin list
     for (int i = 0; i < fanin_count; i++) {
         task->fanin_head = pto2_dep_list_prepend(&orch->dep_pool, task->fanin_head, fanin_temp[i]);
     }
-    // Use release semantics to ensure fanin list is visible before fanin_count
-    __atomic_store_n(&task->fanin_count, fanin_count, __ATOMIC_RELEASE);
+    // SEQ_CST store: participates in the global total order with Phase 1's SEQ_CST
+    // fetch_add on s_pto2_fanin_refcount to prevent the IRIW hazard on ARM.
+    // (See comment above the fetch_add in aicpu_executor.cpp Phase 1 for details.)
+    __atomic_store_n(&task->fanin_count, fanin_count, __ATOMIC_SEQ_CST);
 
     CYCLE_COUNT_LAP(g_orch_fanin_cycle);
 
@@ -431,7 +443,7 @@ void pto2_submit_task(PTO2OrchestratorState* orch,
     // ready queue so scheduler threads can pick it up without an O(N) scan.
     if (orch->aicpu_fanin_refcount && fanin_count > 0) {
         int32_t slot = task_id & orch->aicpu_window_mask;
-        int32_t refcount = __atomic_load_n(&orch->aicpu_fanin_refcount[slot], __ATOMIC_ACQUIRE);
+        int32_t refcount = __atomic_load_n(&orch->aicpu_fanin_refcount[slot], __ATOMIC_SEQ_CST);
         if (refcount >= fanin_count) {
             // All producers already completed — push to orch ready queue
             int32_t tail = orch->orch_ready_tail;
diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
index cdd9ddd21..34e70b579 100644
--- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
+++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -354,4 +354,26 @@ typedef void (*PTO2InCoreFunc)(void** args, int32_t num_args);
 #define PTO2_EXCHANGE(ptr, val) \
     __atomic_exchange_n(ptr, val, __ATOMIC_ACQ_REL)
 
+// =============================================================================
+// Per-task fanout spinlock helpers
+//
+// Used by BOTH the orchestrator (pto_orchestrator.cpp) and the scheduler
+// (aicpu_executor.cpp). Placing them here ensures both translation units use
+// identical acquire/release semantics.
+//
+// The fanout_lock MUST be held whenever reading or writing fanout_head /
+// fanout_count, because the orchestrator adds consumers concurrently with the
+// scheduler traversing the list after task completion.
+// =============================================================================
+
+static inline void pto2_fanout_lock(PTO2TaskDescriptor* task) {
+    while (PTO2_EXCHANGE(&task->fanout_lock, 1) != 0) {
+        PTO2_SPIN_PAUSE_LIGHT();
+    }
+}
+
+static inline void pto2_fanout_unlock(PTO2TaskDescriptor* task) {
+    PTO2_STORE_RELEASE(&task->fanout_lock, 0);
+}
+
 #endif // PTO_RUNTIME2_TYPES_H
diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_types.h b/src/runtime/tensormap_and_ringbuffer/runtime/pto_types.h
index 799c5a2aa..59fa3d87d 100644
--- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_types.h
+++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_types.h
@@ -41,8 +41,9 @@ enum class PTOParamType : int32_t {
  * automatic dependency detection via TensorMap overlap checking.
  *
  * For OUTPUT params with tensor->buffer.addr == 0, the runtime allocates
- * a buffer and writes the address back through the pointer, implicitly
- * updating the caller's local Tensor. No manual sync needed.
+ * from the heap ring in pto2_submit_task (not in make_tensor) and writes the
+ * address back through the pointer. No buffer content is copied; input/inout
+ * tensors already point to their storage, so no memcpy on submit.
  *
  * Example:
  *   Tensor td_a = make_tensor_external(dev_a, size);
diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/tensor.h b/src/runtime/tensormap_and_ringbuffer/runtime/tensor.h
index 7d061b698..a60de2afb 100644
--- a/src/runtime/tensormap_and_ringbuffer/runtime/tensor.h
+++ b/src/runtime/tensormap_and_ringbuffer/runtime/tensor.h
@@ -251,7 +251,9 @@ static inline Tensor make_tensor_external(
 
 /**
  * Create a Tensor for runtime-allocated output (addr=0).
- * The runtime fills in the actual address during pto2_submit_task.
+ * NO memory allocation: only records dtype, shape, and buffer.size in the Tensor struct.
+ * The runtime allocates from the heap ring and fills buffer.addr during pto2_submit_task
+ * when this tensor is passed as OUTPUT param. No buffer content is ever copied.
  */
 static inline Tensor make_tensor(uint64_t size_bytes, DataType dtype = DataType::FLOAT32, int32_t version = 0) {
     return Tensor::make_1d_contiguous(0, size_bytes, dtype, version);
@@ -259,7 +261,9 @@ static inline Tensor make_tensor(uint64_t size_bytes, DataType dtype = DataType:
 
 /**
  * Create a Tensor for runtime-allocated output (addr=0).
- * The runtime fills in the actual address during pto2_submit_task.
+ * NO memory allocation: only records dtype, shape, and buffer.size in the Tensor struct.
+ * The runtime allocates from the heap ring and fills buffer.addr during pto2_submit_task
+ * when this tensor is passed as OUTPUT param. No buffer content is ever copied.
  */
 static inline Tensor make_tensor(
     const uint64_t shapes[], uint64_t ndims, DataType dtype = DataType::FLOAT32, int32_t version = 0) {

From 6a119d031743ba21d3a5f093f6e3e86b0769f24d Mon Sep 17 00:00:00 2001
From: liaoheng <hengliao1972@163.com>
Date: Thu, 26 Feb 2026 12:15:36 +0800
Subject: [PATCH 2/6] Enhance: paged attention example with multi-batch test
 cases

- Add CaseBatch2/4/8/16 test cases with varying batch sizes
- Clean up kernel code: remove unused printf, fix pipe barriers
- Add TROUBLESHOOTING.md documenting known issues and fixes
---
 .../paged_attention/golden.py                 |  18 +
 .../paged_attention/TROUBLESHOOTING.md        | 350 ++++++++++++++++++
 .../kernels/aic/aic_pv_matmul.cpp             |   1 -
 .../kernels/aic/aic_qk_matmul.cpp             |   1 -
 .../kernels/aiv/aiv_online_update.cpp         |   7 +-
 .../orchestration/paged_attention_orch.cpp    |   6 +-
 6 files changed, 376 insertions(+), 7 deletions(-)
 create mode 100644 tests/device_tests/tensormap_and_ringbuffer/paged_attention/TROUBLESHOOTING.md

diff --git a/examples/tensormap_and_ringbuffer/paged_attention/golden.py b/examples/tensormap_and_ringbuffer/paged_attention/golden.py
index cb02beb14..41e56ffbe 100644
--- a/examples/tensormap_and_ringbuffer/paged_attention/golden.py
+++ b/examples/tensormap_and_ringbuffer/paged_attention/golden.py
@@ -43,6 +43,24 @@
         "context_len": 128,
         "max_model_len": 256,
     },
+    "CaseBatch2": {
+        "batch": 2,
+        "num_heads": 16,
+        "kv_head_num": 1,
+        "head_dim": 16,
+        "block_size": 16,
+        "context_len": 33,
+        "max_model_len": 256,
+    },
+    "CaseBatch4": {
+        "batch": 4,
+        "num_heads": 16,
+        "kv_head_num": 1,
+        "head_dim": 16,
+        "block_size": 16,
+        "context_len": 33,
+        "max_model_len": 256,
+    },
 }
 
 # Select case by env var PA_CASE, default to Case1
diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/TROUBLESHOOTING.md b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/TROUBLESHOOTING.md
new file mode 100644
index 000000000..74b704ff3
--- /dev/null
+++ b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/TROUBLESHOOTING.md
@@ -0,0 +1,350 @@
+# Paged Attention 测试排错说明
+
+## 1. 间歇性测试失败（有时 PASS，有时 853/131072 或 1652/131072 不匹配）
+
+### 可能原因
+
+- **浮点非确定性**  
+  Golden 在 host 上按固定顺序做 online softmax；device 上同一 (batch, head) 的多个 block 通过依赖串行执行，但各 kernel 内部或不同 core 间的浮点运算顺序、舍入可能略有差异，导致边界元素在 `rtol=1e-3, atol=1e-3` 下偶发不匹配。
+
+- **调度/时序**  
+  依赖关系由 orchestrator 的 INOUT 链正确构建（UP(bn) 依赖 UP(bn-1)），理论上执行顺序确定。若仍出现间歇性错误，需排查：  
+  - 完成信号与 GM 写回顺序：AICore 在置 `task_status` 前是否已保证对 GM 的写对后续 task 可见；  
+  - 是否存在极少数路径下 fanin 未满足即被调度（需依赖与完成逻辑的审计）。
+
+### 建议
+
+- 多跑几次用例，若仅偶发 1 次失败，多为浮点或环境波动，可暂时放宽 tolerance 或接受偶发差异。  
+- 若需严格可复现：可考虑在 device 上对同一 (batch, head) 的 online update 使用更确定的归约顺序或同一 core 串行化（会牺牲性能）。  
+- 定位时可在脚本中循环跑 N 次并统计失败率，或临时关闭 work stealing 观察是否仍失败。
+
+---
+
+## 2. rtFree failed: 507899（finalize 阶段）
+
+### 原因
+
+CANN 在 **stream 已 destroy 之后** 再对 device 内存调用 `rtFree` 时，可能返回 507899（或类似错误）。原先 `DeviceRunner::finalize()` 的顺序是：先 `rtStreamDestroy`，再 `perf_collector_.finalize()` 和 `mem_alloc_.finalize()`，导致在无有效 stream 的情况下执行 rtFree。
+
+### 修复
+
+已在 `src/platform/a2a3/host/device_runner.cpp` 中调整顺序：
+
+1. 先执行所有通过 `mem_alloc_` 的释放：  
+   `kernel_args_.finalize_runtime_args()`, `finalize_device_args()`, `so_info_.finalize()`  
+2. 再执行 `perf_collector_.finalize()` 和 `mem_alloc_.finalize()`（释放 perf 缓冲区和剩余 kernel/reg 等分配）  
+3. **最后** 再 `rtStreamDestroy(stream_aicpu_)` 和 `rtStreamDestroy(stream_aicore_)`
+
+这样所有 rtFree 都在 stream 仍存在时完成，可避免 507899。
+
+此外在 finalize 开头增加了 `rtDeviceSynchronize(device_id)`，确保所有设备操作（包括可能的异步拷贝）完成后再释放，进一步降低 507899 出现概率。
+
+### 若仍出现 507899
+
+- **确认用的是新 host**：每次执行 `run_example.py` 都会在临时目录完整重编 host（含 `device_runner.cpp`），无需单独执行 `setup.py`。若你改过 `device_runner.cpp`，直接再跑一次用例即可加载新 so。
+- 若重跑后仍报 507899：可查 CANN 文档该错误码含义。当前代码已将 507899 记为 WARN（CANN teardown 已知现象），不再以 ERROR 报出。
+
+### 仅 enable profiling 时出现 507899
+
+原因：profiling 使用的 device 内存在分配后经 **halHostRegister** 做了 host 映射。CANN 的 **halHostUnregister** 在解除映射时可能已释放该 device 内存，若再对该指针调用 **rtFree** 会返回 507899。
+
+修复：在 `device_runner.cpp` 的 finalize 中，对 perf 资源传入的 `free_cb` 改为只做 **untrack**（从 allocator 跟踪中移除），不再对该指针调用 `rtFree`。同时为 `MemoryAllocator` 增加 **untrack(ptr)**，仅从 `ptr_set_` 移除不释放。这样 unregister 后不再对该块调用 rtFree，507899 在开启 profiling 时也应消失。
+
+---
+
+## 3. Performance data collection idle timeout（0 / N records）
+
+### 现象
+
+开启 `--enable-profiling` 时出现：
+
+- `poll_and_collect: Performance data collection idle timeout after 30 seconds`
+- `Collected 0 / 16704 records before timeout`
+- `Total buffers processed: 0`
+
+即 Host 在等 AICPU 往 perf 队列入队 buffer，超时前一直没有新数据。
+
+### 可能原因
+
+- **设备卡住或未真正跑完**：AICore 未完成任务，或设备 14 被其他进程占用，导致 AICPU 从未向 host 可见的队列入队任何 buffer。
+- **偶发时序/负载**：同一命令有时能收满 16704 条记录，有时超时，多为环境或负载偶发。
+
+### 建议
+
+- **先确认不带 profiling 是否通过**：去掉 `--enable-profiling` 再跑，若用例 PASS，说明计算正确，问题仅在 profiling 采集。
+- **直接重跑**：多次运行同一命令，若多数时候能收齐记录，可视为偶发，必要时稍后重试或换设备。
+- 若需稳定拿 perf 数据：可在设备空闲时单独跑带 profiling 的用例，或排查是否有其他进程占用同一 NPU。
+
+---
+
+## 4. 为何 enable profiling 时会“卡住”或像死机？
+
+### 执行顺序（enable profiling 时）
+
+1. Host 分配一块 **device 上的 perf 共享内存**，并用 `halHostRegister` 映射到 host，使 host 与 AICPU 都能访问。
+2. 把 `runtime.perf_data_base` 设为该块地址，并随 runtime 拷贝到设备；AICPU 侧用该地址写每条任务完成后的 perf 记录。
+3. **Launch**：依次下发 AICPU Init、AICPU Main（DynTileFwkKernelServer）、AICore kernel。
+4. **紧接着**（在 `rtStreamSynchronize` 之前）Host 调用 **`poll_and_collect`**：在这里 **轮询** 读 perf 共享区里的队列（`queue_heads` / `queue_tails`），每收到 AICPU 入队的一个 buffer 就处理其中的记录；直到 **已收集条数 ≥ expected_tasks** 或 **连续 30 秒没有新 buffer** 才返回。
+5. 之后才执行 `rtStreamSynchronize`、copy-back、比对结果。
+
+因此：**只要设备没有把 perf 数据写入队列，Host 就会一直停在 `poll_and_collect` 里**，看起来像“卡住”。
+
+### “卡住”的两种含义
+
+| 情况 | 表现 | 原因 |
+|------|------|------|
+| **进程卡住约 30 秒后继续** | 无新 buffer 时轮询满 30 秒，然后打印 idle timeout、0 records，接着继续跑完（可能结果仍 PASS） | 设备侧没写入 perf 队列：AICore 未完成任务、设备被占用、或 AICPU 未正确写 total_tasks/入队。Host 只是阻塞等待，并非真死机。 |
+| **整机死机/无响应** | 机器完全卡死，只能断电或强制重启 | 少见：可能为 NPU 驱动在 **profiling 路径**（如 `halHostRegister`、大量 host 可见的 device 内存访问）下的 bug，或设备/驱动在特定负载下挂死。 |
+
+### 建议
+
+- **多数是第一种**：等满 30 秒会超时退出，属“假死”；可先去掉 `--enable-profiling` 确认用例能稳定 PASS，再在设备空闲时开 profiling 多跑几次。
+- **若是整机死机**：尝试升级 CANN/驱动；或暂时不用 `--enable-profiling` 规避；若可稳定复现，需带环境信息向 CANN/设备侧反馈（profiling + halHostRegister 场景）。
+
+---
+
+## 5. Enable profiling 时输出与 golden 不一致（不带 profiling 则 PASS）
+
+### 现象
+
+- 不带 `--enable-profiling`：用例稳定 **PASS**，无 507899。
+- 带 `--enable-profiling`：**TEST FAILED**，如 `Mismatched elements: 750/131072` 或 `1528/131072`，且每次跑不匹配数量可能不同；应用 untrack 修复后 finalize 不再报 507899。
+
+### 可能原因
+
+- **与第 1 节叠加**：profiling 开启后多了一块 host 可访问的 device 内存（halHostRegister）、以及 AICPU 侧写 perf 的额外逻辑，可能改变设备侧内存访问顺序或时序，使原本就存在的浮点非确定性或偶发可见性更易暴露，表现为“带 profiling 更容易不匹配”。
+- **Profiling 路径影响主路径**：设备上写 perf 记录、切换 buffer、更新 total_tasks 等与主计算并发，在极端时序下可能影响主计算（例如总线/缓存、或与 GM 的可见性），导致输出与 golden 不一致。
+
+### 建议
+
+- **以不带 profiling 的 PASS 为准**：若不带 `--enable-profiling` 稳定 PASS，可认为主计算逻辑正确；带 profiling 的失败可先视为 profiling 与主路径的交互/时序问题。
+- **需要 perf 数据时**：多跑几次带 profiling 的用例，有时会 PASS；或先跑不带 profiling 做正确性回归，需要 swimlane 时再单独跑带 profiling 并接受偶发不匹配。
+- **根因排查**：可在 AICPU 侧临时关闭写 perf（或延后写）、或调整 poll_and_collect 与 rtStreamSynchronize 的顺序做对比，确认是否与“host 读 perf 与 device 写主结果”的并发有关。
+
+---
+
+## 5.1 检查 device 0 状态与是否有进程占用
+
+### 查看 device 0 状态（npu-smi）
+
+```bash
+# 设备用量（HBM、AICore/AIV/AICPU 占用率等）
+npu-smi info -t usages -i 0
+
+# 设备概要（温度、功耗、AICore 数量等）
+npu-smi info -t common -i 0
+
+# 内存信息
+npu-smi info -t memory -i 0
+```
+
+若 **Aicore Usage Rate / Aicpu Usage Rate** 等长期为 0，且无业务在跑，可认为设备空闲。
+
+### 查看是否有进程占用 device 0
+
+部分环境不支持 `npu-smi info proc -i 0`，可用下面方式辅助判断：
+
+```bash
+# 查看是否有进程打开 /dev/davinci0（device 0）
+fuser -v /dev/davinci0
+# 或
+lsof /dev/davinci0
+```
+
+无输出则当前没有进程占用 device 0。
+
+```bash
+# 查看是否还有 run_example / paged_attention 在跑（按需改 -d 的 device）
+ps aux | grep -E "run_example|paged_attention" | grep -v grep
+```
+
+若有卡住的用例，可 `kill <pid>` 后再复位设备。
+
+### 复位 device 0 后重跑（需 root 执行复位）
+
+复位命令必须由 root 执行（`sudo npu-smi set -t reset -i 0`）。若无 sudo 权限，需联系管理员执行复位；**若 device 0 当前无进程占用且 npu-smi 显示 Aicore/Aicpu 占用率为 0，可直接不复位直接重跑用例**。
+
+```bash
+sudo npu-smi set -t reset -i 0
+sleep 20
+cd /path/to/simpler
+PA_CASE=Case1 python examples/scripts/run_example.py -k ... -g ... -d 0
+```
+
+---
+
+## 6. Device log 位置与 Ready queue 抢锁统计
+
+### 如何获取 device log（a2a3 真机）
+
+AICPU 的 `DEV_ALWAYS` 通过 CANN 的 **dlog** 输出，不会出现在 run_example 的终端里，而是写入 CANN 的 device 日志目录：
+
+- **默认路径**：`$HOME/ascend/log/debug/device-<device_id>/`
+- 每次运行会生成或追加到类似 `device-<pid>_<timestamp>.log` 的文件。
+- 最近一次运行的日志可按时间戳或修改时间找到；也可在运行前设置 `ASCEND_PROCESS_LOG_PATH=/tmp/ascend_log` 将应用类日志写到指定目录（部分 CANN 版本下 device 侧 dlog 仍可能落在 ascend 默认路径）。
+
+查找包含 ready queue 统计的日志行示例：
+
+```bash
+grep -E "ready_q|lock\(ready_q\)|scheduler stats" $HOME/ascend/log/debug/device-14/*.log | tail -80
+```
+
+### 一次运行中的 Ready queue 抢锁统计（示例）
+
+以下来自一次 paged attention 用例（`--enable-profiling`，device 14，约 16704 tasks，3 个调度线程）的 device log 汇总。
+
+**锁级别（每线程）**
+
+| 线程 | 拿锁总次数 locks | 总等待 wait(μs) | 总持锁 hold(μs) | 平均每次等待 avg_wait(μs) | 平均每次持锁 avg_hold(μs) | 分项 wait/hold(μs): scan / orch / complete / dispatch |
+|------|------------------|----------------|-----------------|---------------------------|---------------------------|------------------------------------------------------|
+| 0    | 45804            | 6983           | 1194            | 0.15                      | 0.03                      | 170/274, 0/0, 1207/776, 5605/144                     |
+| 1    | 42824            | 6795           | 1170            | 0.16                      | 0.03                      | 172/241, 0/0, 1117/771, 5507/158                     |
+| 2    | 38990            | 7386           | 1074            | 0.19                      | 0.03                      | 176/212, 0/0, 1272/768, 5938/94                      |
+
+**Push/Pop 级别（每线程）**
+
+| 线程 | push 次数 | push 平均等待(μs) | push 平均持锁(μs) | pop 次数 | pop 平均等待(μs) | pop 平均持锁(μs) | pop 中 steal 占比 |
+|------|-----------|-------------------|-------------------|----------|------------------|------------------|--------------------|
+| 0    | 6049      | 0.23              | 0.17              | 5898     | 0.95             | 0.02             | 31.5%              |
+| 1    | 5733      | 0.22              | 0.18              | 5781     | 0.95             | 0.03             | 33.5%              |
+| 2    | 4922      | 0.29              | 0.20              | 5025     | 1.18             | 0.02             | 38.0%              |
+
+### 简要分析（Ready queue 抢锁）
+
+- **抢锁强度**：平均每次拿锁 **wait ≈ 0.15–0.19 μs**，**hold ≈ 0.03 μs**；单次 push/pop 在 ready queue 上的访问时间约为 **(wait + hold) ≈ 0.18–0.22 μs**，抢锁开销不大。
+- **时间分布**：绝大部分 wait 来自 **dispatch**（pop 路径），少量来自 **complete**（fanout push）；**scan / orch** 的 wait 和 hold 都很小。
+- **Push 与 pop 开销**：单次 push 平均持锁约 **0.17–0.20 μs**，单次 pop 平均持锁约 **0.02–0.03 μs**；pop 的“平均等待”略高是因为 dispatch 路径上多次拿锁的等待被摊到 pop 次数上。
+- **Work stealing**：约 **31–38%** 的 pop 来自偷取，说明 3 个 shard 间负载较均衡。
+
+---
+
+### 调度开销 break-down（Scheduler overhead）
+
+同一份 device log 中，每线程 **scheduler 总时间** ≈ 34.3 ms，由四段相位 + yield 组成。下面按**相位**拆解，并标出其中 **ready queue 锁 (wait+hold)** 占该相位的比例，便于区分“抢锁”与“其它调度逻辑”。
+
+**1）按相位汇总（以 Thread 0 为例，单位 μs）**
+
+| 相位 | 时间(μs) | 占 total 比例 | 其中 ready_q lock (wait+hold) | lock 占该相位比例 |
+|------|----------|----------------|--------------------------------|-------------------|
+| dispatch | 18461.7 | **53.8%** | 5605+144=5749 | 31.2% |
+| complete | 14201.0 | **41.4%** | 1207+776=1983 | 14.0% |
+| scan | 1428.0 | 4.2% | 170+274=444 | 31.1% |
+| orch_drain | 11.6 | 0.0% | 0 | — |
+| yield | 189.6 | 0.6% | — | — |
+| **合计** | **34291.9** | 100% | **8177** | **23.8%** |
+
+Thread 1、2 数值类似：total ≈ 34.3 ms，ready_q lock 合计约 7.9–8.5 ms/线程，占 **总调度时间约 23–25%**。
+
+**2）各相位含义与“非锁”部分在做什么**
+
+- **Dispatch（~54%）**  
+  - 总时间 ~18.4 ms；其中 **~5.7 ms 为 ready queue 锁**（pop 时抢锁），其余 **~12.7 ms** 为：轮询 AIC/AIV core 状态、从 ready 取到 task 后的 resolve、下发 kernel 到 AICore、写 perf 等。
+- **Complete（~41%）**  
+  - 总时间 ~14.2 ms；其中 **~2.0 ms 为 ready queue 锁**（fanout 完成后 push），其余 **~12.2 ms** 为：遍历 fanout 链表、更新 consumer fanin、判断是否 ready、写完成状态等（**lock(fanout)** 本次为 0，无争用）。
+- **Scan（~4%）**  
+  - 总时间 ~1.4 ms；其中 **~0.44 ms 为 ready queue 锁**（扫描时 drain 到 ready），其余为扫描 task 状态、判断是否可入队等。
+- **Orch_drain**  
+  - 可忽略（~10 μs）。
+- **Yield（~0.6%）**  
+  - 本线程无任务可做时 yield，与锁无关。
+
+**3）结论（调度开销 break-down）**
+
+- 单次 ready queue 访问（push/pop）的锁开销 ≈ **0.18–0.22 μs**，抢锁不重。
+- 调度总时间 ~34.3 ms/线程里：
+  - **~23–25%** 是 **ready queue spinlock**（wait+hold）；
+  - **~75–77%** 是 **其它调度逻辑**：dispatch 的轮询+resolve+下发、complete 的 fanout 遍历与更新、scan 的扫描与入队判断等。
+- 若进一步优化调度，可优先看 **dispatch 与 complete 的非锁路径**（轮询方式、fanout 遍历与缓存、resolve/launch 开销），其次才是 ready queue 锁本身。
+
+**4）按每 task 平均（Per-task averages）**
+
+将上述各相位时间除以该线程 **completed** 任务数，得到“每完成一个 task、该线程在调度上平均花费的时间”（单位 μs/task）。同一份 log 数据：
+
+| 线程 | completed | total | dispatch | complete | scan | yield | ready_q lock (wait+hold) |
+|------|-----------|------|----------|----------|------|-------|---------------------------|
+| 0 | 5898 | **5.81** | 3.13 | 2.41 | 0.24 | 0.03 | **1.39** |
+| 1 | 5781 | **5.93** | 3.19 | 2.48 | 0.23 | 0.03 | **1.38** |
+| 2 | 5025 | **6.83** | 3.75 | 2.85 | 0.19 | 0.03 | **1.68** |
+
+即：**每完成 1 个 task，调度侧平均约 5.8–6.8 μs**；其中约 **1.4–1.7 μs** 为 ready queue 锁，约 **3.1–3.8 μs** 为 dispatch（轮询+resolve+下发），约 **2.4–2.9 μs** 为 complete（fanout 遍历+push 等），scan/yield 合计约 0.2–0.3 μs/task。
+
+**5）每 task 平均执行时间与调度开销汇总**
+
+同一次运行中，host 端 **Task Statistics**（swimlane 输出）给出 AICore 上 kernel 的 **Total_Exec**；device log 给出三线程 **scheduler 总时间**。对 16704 个 task 做平均：
+
+| 指标 | 计算 | 数值 |
+|------|------|------|
+| 总 task 数 | — | 16704 |
+| **每 task 平均执行时间**（AICore kernel） | Total_Exec / 16704 | **27849.48 / 16704 ≈ 1.67 μs** |
+| **每 task 平均调度开销**（AICPU 调度循环） | (Thread0+Thread1+Thread2) total / 16704 | **(34292+34289+34297) / 16704 ≈ 6.16 μs** |
+| 调度/执行比 | 调度 / 执行 | **6.16 / 1.67 ≈ 3.7** |
+
+即：平均每个 task 在 AICore 上执行约 **1.67 μs**，在 AICPU 调度循环里摊到约 **6.16 μs**；调度开销约为 kernel 执行时间的 **3.7 倍**。调度开销的细分（dispatch / complete / ready_q lock / scan / yield）见上表「4）按每 task 平均」各列。
+
+若需复现或对比：用 `--enable-profiling` 跑一次，再到 `$HOME/ascend/log/debug/device-<id>/` 下找最新 `device-*.log`，用上述 grep 提取 `lock(ready_q)` 与各 phase 行；Task Statistics 见 run 终端输出的 `Task Statistics by Function` 表（Total_Exec、总 task 数）。
+
+---
+
+## 7. halMemCtl failed (rc=13) 与“运行不结束”
+
+### 现象
+
+启动时日志出现：
+
+- `[ERROR] get_aicore_reg_info: halMemCtl failed with rc=13`
+- `[ERROR] get_aicore_regs: get_aicore_reg_info failed, using placeholder addresses`
+- `[INFO] init_aicore_register_addresses: Successfully initialized ... 72 addresses at device 0x...`（24/48 配置下为 72）
+
+之后用例**一直不结束**，或设备无有效计算。
+
+### 原因
+
+`halMemCtl` 用于向 CANN HAL 查询 AICore 寄存器映射的虚拟地址。返回 **rc=13** 时（多为权限或资源被占用），host 侧会走 **fallback**：用占位地址 `0xDEADBEEF...` 填满 24 AIC + 48 AIV（72 个）寄存器基址并拷到设备。
+
+这些占位地址**不是**真实的 AICore MMIO 基址，AICPU 下发 kernel 时写的是无效地址，导致：
+
+- AICore 上 kernel 实际未执行，或
+- 访问非法地址导致设备异常/挂起，
+
+因此调度循环一直在等 AICore 完成，**运行无法正常结束**。
+
+### 建议
+
+1. **确认环境**：用与安装 CANN 一致的用户运行；同一设备不要被其他进程独占。
+2. **查 CANN 文档**：针对 `halMemCtl` 错误码 13 的官方说明（常见为权限不足或设备状态异常）。
+3. **不要依赖 placeholder**：出现 “using placeholder addresses” 时，当前 run 不能作为有效执行/性能数据；需解决 HAL 报错后再跑。
+4. **若仅做 AICPU 或仿真**：若用例不依赖真实 AICore 寄存器（例如纯 sim 或仅测 host 路径），可暂时忽略；paged attention 依赖 AICore 执行 QK/PV 等 kernel，**必须**拿到真实寄存器地址才能跑通。
+
+---
+
+## 8. 卡在 rtStreamSynchronize stream_aicpu_
+
+### 现象
+
+日志中已出现：
+
+- `Retrieved 24 AIC and 48 AIV register addresses`
+- `=== launch_aicpu_kernel DynTileFwkKernelServerInit===`
+- `=== launch_aicpu_kernel DynTileFwkKernelServer===`
+- `=== launch_aicore_kernel===`
+- `=== rtStreamSynchronize stream_aicpu_===`
+
+之后进程**一直不返回**，需 ^C 中断。
+
+### 含义
+
+Host 在等待 **AICPU stream** 上提交的任务全部完成。卡住说明 AICPU 侧（调度循环 + AICore 执行）要么未正常结束，要么设备/驱动无响应。
+
+### 可能原因
+
+- **设备异常或占用**：部分 device ID（如 13、15）在该环境下可能挂起、被占用或与 HAL/驱动配合异常。
+- **AICPU 调度或 AICore 执行死循环/死锁**：依赖未满足、完成信号未正确写回等（若多数设备都卡则重点排查）。
+
+### 建议
+
+1. **换 device**：优先使用在本机已验证能跑通的 device（例如 device 0）。例如：
+   ```bash
+   PA_CASE=Case1 python examples/scripts/run_example.py -k ... -g ... -p a2a3 -d 0
+   ```
+3. **查看 device 日志**：`$HOME/ascend/log/debug/device-<id>/` 下最新 `device-*.log`，搜索 `error`、`fail`、`507015` 等，确认设备侧是否有报错或超时。
+4. **确认 24/48 配置**：日志中应为 “24 AIC and 48 AIV”；若曾出现 25/50，需确保已用当前代码重编并重跑。
diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp
index b537f78bb..185554c84 100644
--- a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp
+++ b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp
@@ -87,7 +87,6 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {
     __gm__ Tensor* vj = reinterpret_cast<__gm__ Tensor*>(args[1]);
     __gm__ Tensor* oi_new = reinterpret_cast<__gm__ Tensor*>(args[2]);
     uint64_t q_tile_size = static_cast<uint64_t>(pij->repeats[0]);
-    // args[4] = block_size, args[5] = head_dim
 
     if (q_tile_size == 16) {
         pv_matmul_impl<16, 128, 128>(pij, vj, oi_new);
diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp
index c7c49ce24..8b7f64771 100644
--- a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp
+++ b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp
@@ -88,7 +88,6 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {
     __gm__ Tensor* kj = reinterpret_cast<__gm__ Tensor*>(args[1]);
     __gm__ Tensor* sij = reinterpret_cast<__gm__ Tensor*>(args[2]);
     uint64_t q_tile_size = static_cast<uint64_t>(qi->repeats[0]);
-    // args[4] = head_dim (128), args[5] = block_size
 
     if (q_tile_size == 16) {
         qk_matmul_impl<16, 128, 128>(qi, kj, sij);
diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp
index e3a1c8706..0725f32c6 100644
--- a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp
+++ b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp
@@ -1,8 +1,8 @@
 // Online Softmax Update + Normalize Kernel (AIV)
 //
-// Operates on full tiles where M=q_tile_size, N=head_dim (128):
-//   Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors
-//   Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors
+// Operates on full tiles where M=q_tile_size, N=head_dim:
+//   Case1: oi/oi_new are (16, 128) row-major, mij/lij/mi/li are 16-element vectors
+//   Case2: oi/oi_new are (64, 128) row-major, mij/lij/mi/li are 64-element vectors
 //
 // Scalar layout strategy:
 //   M scalar floats stored contiguously in GM can be loaded as either:
@@ -232,7 +232,6 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {
     uint64_t is_first = static_cast<uint64_t>(args[7]);
     uint64_t is_last = static_cast<uint64_t>(args[8]);
     uint64_t q_tile_size = static_cast<uint64_t>(mij->repeats[0]);
-    // args[10] = head_dim (128)
 
     if (q_tile_size == 16) {
         online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
index 8151b4f10..28878ba0f 100644
--- a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
+++ b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
@@ -185,6 +185,10 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim
                     uint64_t is_first = (bn == 0) ? 1 : 0;
                     uint64_t is_last = (bn == bn_this_batch - 1) ? 1 : 0;
 
+                    // Single ONLINE_UPDATE per block for all head_dim values.
+                    // PV writes oi_tmp in row-major (M, head_dim); ONLINE_UPDATE reads it with
+                    // matching stride. The previous 2x(16,128) split for head_dim=256 had scalar
+                    // double-counting bugs and wrong stride mismatches; removed.
                     Tensor out_view = out.view({q_tile, head_dim}, {cur_offset, 0});
                     PTOParam params_up[] = {
                         make_input_param(mi),
@@ -197,7 +201,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim
                         make_scalar_param(is_first),
                         make_scalar_param(is_last),
                     };
-                    TIMED_SUBMIT_TASK(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, params_up, 9); // v2
+                    TIMED_SUBMIT_TASK(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, params_up, 9);
                 }
             }
         }

From 90825fa5c714929f9181ff579d9600a29f75efe4 Mon Sep 17 00:00:00 2001
From: liaoheng <hengliao1972@163.com>
Date: Thu, 26 Feb 2026 12:15:48 +0800
Subject: [PATCH 3/6] Feature: batch paged attention with in-kernel batch loop

Implement a new batch_paged_attention architecture that moves the batch
iteration loop inside each kernel, eliminating task count explosion.

Key changes:
- Orchestrator submits constant 13 tasks regardless of batch size
- QK, Softmax, PV, Online-Update kernels process all batches internally
  via pointer arithmetic on batched tensors
- block_table and context_lens passed as scalar pointers to avoid
  exceeding PTO2 tensor parameter limits
- Kernel memory (L1/L0/UB tiles) reused across batch iterations
- Supports batch sizes from 1 to 256 with Exec/Sched ratio up to 93%

Previously batch>=16 caused AICPU scheduler hang (208+ tasks).
---
 .../TFILLPAD_INPLACE_BUG.md                   | 205 +++++++++++
 .../batch_paged_attention/golden.py           | 339 ++++++++++++++++++
 .../kernels/aic/aic_hub.cpp                   |  18 +
 .../kernels/aic/aic_pv_matmul.cpp             | 108 ++++++
 .../kernels/aic/aic_qk_matmul.cpp             | 112 ++++++
 .../kernels/aiv/aiv_hub.cpp                   |  18 +
 .../kernels/aiv/aiv_online_update.cpp         | 222 ++++++++++++
 .../kernels/aiv/aiv_softmax_prepare.cpp       | 146 ++++++++
 .../kernels/kernel_config.py                  |  45 +++
 .../orchestration/paged_attention_orch.cpp    | 198 ++++++++++
 10 files changed, 1411 insertions(+)
 create mode 100644 examples/tensormap_and_ringbuffer/batch_paged_attention/TFILLPAD_INPLACE_BUG.md
 create mode 100644 examples/tensormap_and_ringbuffer/batch_paged_attention/golden.py
 create mode 100644 examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp
 create mode 100644 examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp
 create mode 100644 examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp
 create mode 100644 examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp
 create mode 100644 examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp
 create mode 100644 examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
 create mode 100644 examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py
 create mode 100644 examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp

diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/TFILLPAD_INPLACE_BUG.md b/examples/tensormap_and_ringbuffer/batch_paged_attention/TFILLPAD_INPLACE_BUG.md
new file mode 100644
index 000000000..5d83385ac
--- /dev/null
+++ b/examples/tensormap_and_ringbuffer/batch_paged_attention/TFILLPAD_INPLACE_BUG.md
@@ -0,0 +1,205 @@
+# TFILLPAD_INPLACE Bug at Small Tile Width (N ≤ 16)
+
+## Summary
+
+`TFILLPAD_INPLACE` produces incorrect padding results on Ascend A2/A3 hardware when
+the tile column count `N` is small (e.g. N=16 for float32). The bug manifests as
+corrupted data in the padded region for certain `valid_len` values, causing downstream
+softmax and attention computations to produce wrong results.
+
+## Affected Configuration
+
+- **Platform**: Ascend A2/A3 (tested on hardware, also reproduces on simulator)
+- **Data type**: float32 (sizeof=4)
+- **Tile shape**: (M, N) = (16, 16) — i.e. 2 × 32-byte blocks per row
+- **PTO source**: `include/pto/npu/a2a3/TFillPad.hpp`
+
+The bug does NOT reproduce at larger N values (N=32, 64, 128) where the same
+`valid_len` values work correctly.
+
+## Reproduction
+
+In the paged attention example (`examples/tensormap_and_ringbuffer/paged_attention/`),
+the softmax preparation kernel uses `TFILLPAD_INPLACE` to mask invalid key positions
+with `-inf` before computing softmax:
+
+```cpp
+// Tile types
+using TileSijDyn = Tile<TileType::Vec, float, 16, 16, BLayout::RowMajor, 16, -1>;
+using TileSijPad = Tile<TileType::Vec, float, 16, 16, BLayout::RowMajor, 16, 16,
+                        SLayout::NoneBox, 512, PadValue::Min>;
+
+TileSijDyn sijDynTile(valid_len);  // valid_len = number of valid columns
+TileSijPad sijPadTile;
+// Both assigned to same UB address (in-place)
+TASSIGN(sijDynTile, 0x0);
+TASSIGN(sijPadTile, 0x0);
+
+// After loading sij from GM:
+TFILLPAD_INPLACE(sijPadTile, sijDynTile);
+// Expected: columns [valid_len, 16) filled with -inf (0xff800000)
+// Actual:   corrupted for certain valid_len values
+```
+
+### Test Matrix (N=16, float32, on hardware)
+
+| valid_len | context_len | blocks | TFILLPAD_INPLACE only | SetValue only | TFILLPAD + SetValue |
+|-----------|-------------|--------|-----------------------|---------------|---------------------|
+| 1         | 17          | 2      | FAIL (27/256)         | PASS          | PASS                |
+| 7         | 23          | 2      | FAIL (29/256)         | PASS          | PASS                |
+| 8         | 24          | 2      | FAIL (28/256)         | FAIL (182/256)| PASS                |
+| 9         | 25          | 2      | PASS                  | PASS          | PASS                |
+| 12        | 28          | 2      | PASS                  | PASS          | PASS                |
+| 15        | 31          | 2      | PASS                  | PASS          | PASS                |
+| 16 (full) | 32          | 2      | PASS                  | PASS          | PASS                |
+| 1         | 33          | 3      | FAIL (25/256)         | FAIL (88/256) | PASS                |
+
+### Cross-dimension validation (confirming N=16 is the trigger)
+
+| num_heads | head_dim | block_size (=N) | context_len | valid_len | Result |
+|-----------|----------|-----------------|-------------|-----------|--------|
+| 16        | 16       | **16**          | 33          | 1         | FAIL   |
+| 16        | 16       | **32**          | 33          | 1         | PASS   |
+| 16        | **32**   | **16**          | 33          | 1         | FAIL   |
+
+block_size determines N in the softmax tile (M, N). When block_size=32 (N=32),
+the same valid_len=1 passes. When block_size=16 (N=16), it fails regardless of
+head_dim.
+
+## Root Cause Analysis
+
+The bug is in the `TFillPad` function in `include/pto/npu/a2a3/TFillPad.hpp`.
+The function has two internal code paths for filling padding:
+
+### Path A: `Handle32BAlignedPad_Other` (lines 103-134)
+
+Fills the **partial 32-byte block** at the boundary using `vector_dup` with a
+norm-mode bitmask. This path is reliable.
+
+### Path B: `PadRightSingleRow` + `PadRightRemainingRows` (lines 136-167)
+
+Fills **complete 32-byte blocks** to the right of the boundary. Uses `vector_dup`
+for row 0, then `vcopy` with `srcRepeatStride=0` (broadcast) to replicate to
+remaining rows. **This path has the bug.**
+
+### Which path runs depends on `valid_len`
+
+The key variable is `srcValidCol32B` — the valid_len rounded up to the next
+32-byte-aligned element count:
+
+```
+elements_per_block = 32 / sizeof(float) = 8
+srcValidCol32B = ceil(valid_len / 8) * 8
+padOffset = srcValidCol32B
+padCols = N - srcValidCol32B        // columns for Path B
+pad_32B = srcValidCol32B - valid_len // columns for Path A
+```
+
+For N=16 (2 blocks of 8 elements each):
+
+```
+valid_len ∈ [1, 8]:
+    srcValidCol32B = 8
+    padOffset = 8,  padCols = 8   → Path B runs (fills block 1)
+    pad_32B = 8 - valid_len       → Path A runs if valid_len < 8
+
+valid_len ∈ [9, 15]:
+    srcValidCol32B = 16
+    padOffset = 16, padCols = 0   → Path B is a NO-OP
+    pad_32B = 16 - valid_len      → Path A runs (fills within block 1)
+
+valid_len = 16:
+    No padding needed (full block)
+```
+
+**Pattern: valid_len ≤ 8 → Path B runs → BUG. valid_len ≥ 9 → only Path A → OK.**
+
+### Path B code trace (the buggy path)
+
+```cpp
+// PadRightSingleRow: fill row 0's right padding
+set_mask_count();
+set_vector_mask(0, padCols);  // padCols = 8
+vector_dup(dstPtr + padOffset, dupPadValue, 1, 1, 1, 8, 0);
+//         ^-- dstPtr + 8 (element 8 of row 0)
+pipe_barrier(PIPE_V);
+
+// PadRightRemainingRows: broadcast row 0's pattern to rows 1..M-1
+dstRepeatStride = N * sizeof(float) / 32;  // = 16 * 4 / 32 = 2
+_dstPtr = dstPtr + padOffset + copyDstCols; // = dstPtr + 8 + 16 = dstPtr + 24
+fillRow = M - 1;  // = 15
+
+vcopy(_dstPtr, dstPtr + padOffset, 15, 1, 0, 2, 0);
+//    dst       src                rep  dB sB dR sR
+//    row1:8    row0:8             15   1  0  2  0
+//
+// dstRepeatStride=2 (64 bytes = 1 row), srcRepeatStride=0 (broadcast)
+// mask: counter mode, 8 elements (inherited from PadRightSingleRow)
+```
+
+The `vcopy` with `srcRepeatStride=0` and `dstRepeatStride=2` at N=16 appears to
+produce incorrect results on hardware. The exact hardware failure mode is unclear,
+but it consistently corrupts the padding data.
+
+### Why valid_len=8 is special
+
+When `valid_len=8`:
+- `pad_32B = 8 - 8 = 0` → Path A computes `mask = 0xff >> 8 << 8 = 0`
+- `set_vector_mask(0, 0)` is called, then `vector_dup` with zero mask
+- This is effectively a no-op, but may have undefined behavior on hardware
+- Path B still runs and produces incorrect results
+- Additionally, `SetValue`-only workaround also fails for valid_len=8,
+  suggesting the zero-mask `vector_dup` in Path A corrupts pipeline state
+
+## Workaround
+
+The working fix uses **both** `TFILLPAD_INPLACE` and scalar `SetValue` writes:
+
+```cpp
+// Step 1: TFILLPAD_INPLACE sets up vector pipeline state correctly
+//         (mask modes, barriers, etc.) even though its data output is buggy
+TFILLPAD_INPLACE(sijPadTile, sijDynTile);
+
+// Step 2: SetValue patches the actual data with correct -inf values
+if (valid_len < static_cast<uint64_t>(N)) {
+    constexpr float NEG_INF = -__builtin_huge_valf();
+    for (int r = 0; r < M; r++) {
+        for (uint64_t c = valid_len; c < N; c++) {
+            sijTile.SetValue(static_cast<uint32_t>(r * N + c), NEG_INF);
+        }
+    }
+}
+```
+
+**Why both are needed:**
+
+| Approach               | valid_len=1 | valid_len=7 | valid_len=8 |
+|------------------------|-------------|-------------|-------------|
+| TFILLPAD_INPLACE only  | FAIL        | FAIL        | FAIL        |
+| SetValue only          | PASS        | PASS        | FAIL        |
+| TFILLPAD + SetValue    | PASS        | PASS        | PASS        |
+
+- `TFILLPAD_INPLACE` alone: Path B produces wrong data
+- `SetValue` alone: works for most cases, but valid_len=8 fails because
+  Path A's zero-mask `vector_dup` (which runs before SetValue in the
+  TFILLPAD-only case) apparently sets up necessary pipeline state that
+  subsequent vector operations depend on
+- Both together: TFILLPAD handles pipeline state, SetValue fixes the data
+
+## Scope
+
+- **Affected**: Any `TFILLPAD_INPLACE` call with float32 tiles where
+  `N ≤ 16` and `valid_len ≤ N/2` (i.e. valid data fits within the first
+  32-byte block of each row)
+- **Not affected**: N ≥ 32 (tested with N=32, 64, 128 — all pass)
+- **Not affected**: Full tiles (valid_len == N)
+- **Likely affected**: float16/bfloat16 tiles with N ≤ 32 (untested, but
+  the same code path would be triggered since elements_per_block=16 for
+  16-bit types, and the same vcopy broadcast pattern is used)
+
+## Files
+
+- Bug location: `include/pto/npu/a2a3/TFillPad.hpp`, functions
+  `PadRightSingleRow` (line 136) and `PadRightRemainingRows` (line 146)
+- Workaround applied in: `examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp`
+- Test configuration: `examples/tensormap_and_ringbuffer/paged_attention/golden.py`
diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/golden.py b/examples/tensormap_and_ringbuffer/batch_paged_attention/golden.py
new file mode 100644
index 000000000..f9f42b343
--- /dev/null
+++ b/examples/tensormap_and_ringbuffer/batch_paged_attention/golden.py
@@ -0,0 +1,339 @@
+"""
+Paged Attention Golden Implementation - Small Scale (16x16)
+
+Implements the online softmax algorithm for paged attention with:
+- float16 Q/K/V inputs (sim-compatible)
+- Non-transposed K storage: (total_blocks, block_size, kv_head_num, head_dim)
+- GQA support (kv_head_num=1)
+- 16x16 tile dimensions
+"""
+
+import os
+import struct
+import torch
+
+# Output tensor names
+__outputs__ = ["out"]
+
+# Tensor order matching orchestration function parameter order
+TENSOR_ORDER = ["query", "key_cache", "value_cache", "block_table", "context_lens", "out", "config"]
+
+# Comparison tolerances
+RTOL = 1e-2
+ATOL = 1e-2
+
+
+# All test cases - small scale (16x16 tiles)
+ALL_CASES = {
+    "Case1": {
+        "batch": 1,
+        "num_heads": 16,
+        "kv_head_num": 1,
+        "head_dim": 16,
+        "block_size": 16,
+        "context_len": 33,
+        "max_model_len": 256,
+    },
+    "Case2": {
+        "batch": 1,
+        "num_heads": 16,
+        "kv_head_num": 1,
+        "head_dim": 16,
+        "block_size": 16,
+        "context_len": 128,
+        "max_model_len": 256,
+    },
+    "CaseBatch2": {
+        "batch": 2,
+        "num_heads": 16,
+        "kv_head_num": 1,
+        "head_dim": 16,
+        "block_size": 16,
+        "context_len": 33,
+        "max_model_len": 256,
+    },
+    "CaseBatch4": {
+        "batch": 4,
+        "num_heads": 16,
+        "kv_head_num": 1,
+        "head_dim": 16,
+        "block_size": 16,
+        "context_len": 33,
+        "max_model_len": 256,
+    },
+    "CaseBatch8": {
+        "batch": 8,
+        "num_heads": 16,
+        "kv_head_num": 1,
+        "head_dim": 16,
+        "block_size": 16,
+        "context_len": 33,
+        "max_model_len": 256,
+    },
+    "CaseBatch16": {
+        "batch": 16,
+        "num_heads": 16,
+        "kv_head_num": 1,
+        "head_dim": 16,
+        "block_size": 16,
+        "context_len": 33,
+        "max_model_len": 256,
+    },
+    "CaseBatch32": {
+        "batch": 32,
+        "num_heads": 16,
+        "kv_head_num": 1,
+        "head_dim": 16,
+        "block_size": 16,
+        "context_len": 33,
+        "max_model_len": 256,
+    },
+    "CaseBatch64": {
+        "batch": 64,
+        "num_heads": 16,
+        "kv_head_num": 1,
+        "head_dim": 16,
+        "block_size": 16,
+        "context_len": 33,
+        "max_model_len": 256,
+    },
+    "CaseBatch128": {
+        "batch": 128,
+        "num_heads": 16,
+        "kv_head_num": 1,
+        "head_dim": 16,
+        "block_size": 16,
+        "context_len": 33,
+        "max_model_len": 256,
+    },
+    "CaseBatch256": {
+        "batch": 256,
+        "num_heads": 16,
+        "kv_head_num": 1,
+        "head_dim": 16,
+        "block_size": 16,
+        "context_len": 33,
+        "max_model_len": 256,
+    },
+}
+
+# Select case by env var PA_CASE, default to Case1
+_selected = os.environ.get("PA_CASE", "Case1")
+PARAMS_LIST = [{"name": _selected, **ALL_CASES[_selected]}]
+
+
+def generate_inputs(params: dict) -> dict:
+    """Generate input tensors and zeroed output tensor."""
+    batch = params["batch"]
+    num_heads = params["num_heads"]
+    kv_head_num = params["kv_head_num"]
+    head_dim = params["head_dim"]
+    block_size = params["block_size"]
+    context_len = params["context_len"]
+    max_model_len = params["max_model_len"]
+
+    max_num_blocks_per_req = max_model_len // block_size
+    cur_valid_blocks = (context_len + block_size - 1) // block_size
+    total_blocks = batch * cur_valid_blocks
+    scale_value = 1.0
+    scale_bits = struct.unpack('I', struct.pack('f', scale_value))[0]
+
+    # Random block table: (batch, max_num_blocks_per_req) int32
+    block_table = torch.randint(
+        0,
+        max(total_blocks, 1),
+        size=(batch, max_num_blocks_per_req),
+        dtype=torch.int32,
+    )
+
+    # Context lens: all = context_len
+    context_lens = torch.full((batch,), context_len, dtype=torch.int32)
+
+    config = torch.tensor(
+        [batch, num_heads, kv_head_num, head_dim, block_size,
+         max_num_blocks_per_req, scale_bits],
+        dtype=torch.int64,
+    )
+
+    # Query: (batch, 1, num_heads * head_dim) -> (batch, num_heads, head_dim) float16
+    query_fp16 = torch.empty(batch, 1, num_heads * head_dim).uniform_(-0.5, 0.5).to(torch.float16)
+    query_fp16 = query_fp16.reshape(batch, num_heads, head_dim)
+
+    # Key cache: (total_blocks, block_size, kv_head_num, head_dim) float16
+    key_fp16 = torch.empty(total_blocks, block_size, kv_head_num, head_dim).uniform_(-0.5, 0.5).to(torch.float16)
+
+    # Value cache: (total_blocks, block_size, kv_head_num, head_dim) float16
+    value_fp16 = torch.empty(total_blocks, block_size, kv_head_num, head_dim).uniform_(-1, 1).to(torch.float16)
+
+    return {
+        "query": query_fp16.flatten(),
+        "key_cache": key_fp16.flatten(),
+        "value_cache": value_fp16.flatten(),
+        "block_table": block_table.flatten(),
+        "context_lens": context_lens,
+        "out": torch.zeros(batch * num_heads * head_dim, dtype=torch.float32),
+        "config": config,
+    }
+
+
+def paged_attention(
+    query: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    num_kv_heads: int,
+    num_heads: int,
+    scale_value: float,
+    block_table: torch.Tensor,
+    context_lens: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Compute paged attention using online softmax with head tiling and GQA.
+
+    Vectorized across the batch dimension for performance.
+    Supports different context_lens per batch via masking.
+
+    Args:
+        query: (batch, num_heads, head_dim) bfloat16
+        key_cache: (total_blocks, block_size, num_kv_heads, head_dim) bfloat16
+        value_cache: (total_blocks, block_size, num_kv_heads, head_dim) bfloat16
+        num_kv_heads: int
+        num_heads: int
+        scale_value: float
+        block_table: (batch, block_num) int32
+        context_lens: (batch,) int32
+
+    Returns:
+        out: (batch * num_heads, head_dim) float32
+    """
+    assert num_kv_heads == 1
+    batch, num_heads_dim, head_dim = query.shape
+    _, block_size, _, _ = key_cache.shape
+
+    # Reshape for batched computation
+    key_cache_flat = key_cache.reshape(-1, block_size, head_dim)
+    value_cache_flat = value_cache.reshape(-1, block_size, head_dim)
+
+    out = torch.zeros((batch, num_heads_dim, head_dim), dtype=torch.float32)
+
+    q_tile = min(num_heads_dim, 128)
+
+    # Max blocks across all batches (each batch may have different context_len)
+    max_bn = int(((context_lens.max().item()) + block_size - 1) // block_size)
+
+    for q_offset in range(0, num_heads_dim, q_tile):
+        q_tile_size = min(q_tile, num_heads_dim - q_offset)
+        # qi: (batch, q_tile_size, head_dim)
+        qi = query[:, q_offset:q_offset + q_tile_size, :].to(torch.float32)
+
+        oi = None  # (batch, q_tile_size, head_dim)
+        li = None  # (batch, q_tile_size, 1)
+        mi = None  # (batch, q_tile_size, 1)
+
+        for bn in range(max_bn):
+            # valid_len per batch for this block position
+            valid_lens = torch.clamp(context_lens - bn * block_size, min=0, max=block_size)
+            active_mask = valid_lens > 0  # (batch,)
+
+            if not active_mask.any():
+                break
+
+            # Gather block indices for all batches
+            block_indices = block_table[:, bn]  # (batch,)
+
+            # Gather K and V: (batch, block_size, head_dim)
+            kj_all = key_cache_flat[block_indices].to(torch.float32)
+            vj_all = value_cache_flat[block_indices].to(torch.float32)
+
+            # QK matmul: (batch, q_tile_size, block_size)
+            sij = torch.bmm(qi, kj_all.transpose(1, 2)) * scale_value
+
+            # Mask out invalid positions (beyond valid_len per batch)
+            pos = torch.arange(block_size, device=sij.device).unsqueeze(0)  # (1, block_size)
+            valid_mask = pos < valid_lens.unsqueeze(1)  # (batch, block_size)
+            valid_mask = valid_mask.unsqueeze(1)  # (batch, 1, block_size)
+            sij = sij.masked_fill(~valid_mask, float('-inf'))
+
+            # Also mask inactive batches (no blocks at this position)
+            batch_mask = active_mask.view(-1, 1, 1)  # (batch, 1, 1)
+            sij = sij.masked_fill(~batch_mask, float('-inf'))
+
+            mij = sij.max(dim=-1, keepdim=True)[0]  # (batch, q_tile_size, 1)
+            mij = mij.clamp(min=-1e30)
+            pij = torch.exp(sij - mij)
+            pij = pij.masked_fill(~valid_mask, 0.0)
+            pij = pij.masked_fill(~batch_mask, 0.0)
+            pij = pij.to(torch.bfloat16).to(torch.float32)
+            lij = pij.sum(dim=-1, keepdim=True)  # (batch, q_tile_size, 1)
+
+            # PV matmul: (batch, q_tile_size, head_dim)
+            oi_new = torch.bmm(pij, vj_all)
+
+            if bn == 0:
+                oi = oi_new
+                li = lij
+                mi = mij
+            else:
+                mi_new = torch.maximum(mi, mij)
+                alpha = torch.exp(mi - mi_new)
+                beta = torch.exp(mij - mi_new)
+                li = alpha * li + beta * lij
+                oi = alpha * oi + beta * oi_new
+                mi = mi_new
+
+        # Final normalization
+        out[:, q_offset:q_offset + q_tile_size, :] = oi / li
+
+    return out.reshape(-1, head_dim)
+
+
+def compute_golden(tensors: dict, params: dict) -> None:
+    """Compute expected output in-place using online softmax paged attention."""
+    batch = params["batch"]
+    num_heads = params["num_heads"]
+    kv_head_num = params["kv_head_num"]
+    head_dim = params["head_dim"]
+    block_size = params["block_size"]
+    max_model_len = params["max_model_len"]
+
+    max_num_blocks_per_req = max_model_len // block_size
+
+    # Reconstruct shaped tensors from flat tensors
+    query = tensors["query"].reshape(batch, num_heads, head_dim)
+    key_cache = tensors["key_cache"].reshape(-1, block_size, kv_head_num, head_dim)
+    value_cache = tensors["value_cache"].reshape(-1, block_size, kv_head_num, head_dim)
+    block_table = tensors["block_table"].reshape(batch, max_num_blocks_per_req)
+    context_lens = tensors["context_lens"]
+
+    out = paged_attention(
+        query=query,
+        key_cache=key_cache,
+        value_cache=value_cache,
+        num_kv_heads=kv_head_num,
+        num_heads=num_heads,
+        scale_value=1.0,
+        block_table=block_table,
+        context_lens=context_lens,
+    )
+
+    tensors["out"][:] = out.flatten()
+
+
+if __name__ == "__main__":
+    params = PARAMS_LIST[0]
+    tensors = generate_inputs(params)
+    compute_golden(tensors, params)
+
+    print(f"=== Paged Attention Golden Test ({params['name']}) ===")
+    print(f"batch={params['batch']}, num_heads={params['num_heads']}, head_dim={params['head_dim']}")
+    print(f"kv_head_num={params['kv_head_num']}, block_size={params['block_size']}")
+    print(f"context_len={params['context_len']}")
+
+    max_num_blocks = params['max_model_len'] // params['block_size']
+    q_tile = min(params['num_heads'], 128)
+    print(f"max_num_blocks_per_req={max_num_blocks}, q_tile_size={q_tile}")
+
+    out = tensors["out"].reshape(params["batch"] * params["num_heads"], params["head_dim"])
+    print(f"Output shape: {out.shape}")
+    print(f"Output range: [{out.min():.4f}, {out.max():.4f}]")
+    print(f"Output mean: {out.mean():.4f}")
+    print("Golden test passed!")
diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp
new file mode 100644
index 000000000..0974de371
--- /dev/null
+++ b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp
@@ -0,0 +1,18 @@
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+using namespace pto;
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+constexpr int M = 16;
+constexpr int K = 16;
+constexpr int N = 16;
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {}
diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp
new file mode 100644
index 000000000..bea8c7305
--- /dev/null
+++ b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp
@@ -0,0 +1,108 @@
+// Batched PV Matmul Kernel: for each batch b, pij(M, K) @ vj(K, N) -> oi_new(M, N)
+//
+// Processes batch_count batches in a single kernel invocation.
+// Per-batch addresses are computed from global tensor bases + block_table lookup.
+//
+// Template: M=q_tile, K=block_size, N=head_dim (all 16 for current config)
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int K, int N>
+static __aicore__ void pv_matmul_batch_impl(
+    __gm__ Tensor* pij_batch,
+    __gm__ Tensor* value_cache,
+    __gm__ Tensor* oi_new_batch,
+    uint64_t block_table_ptr,
+    uint64_t batch_count,
+    uint64_t block_idx,
+    uint64_t block_num) {
+
+    __gm__ half* pij_base = reinterpret_cast<__gm__ half*>(pij_batch->buffer.addr);
+    __gm__ half* val_base = reinterpret_cast<__gm__ half*>(value_cache->buffer.addr);
+    __gm__ float* oi_base = reinterpret_cast<__gm__ float*>(oi_new_batch->buffer.addr);
+    __gm__ int32_t* bt = reinterpret_cast<__gm__ int32_t*>(block_table_ptr);
+
+    using GlobalA = GlobalTensor<half, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
+    using GlobalB = GlobalTensor<half, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, N, 1>>;
+    using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<M * N, M * N, M * N, N, 1>>;
+
+    using TileMatA = Tile<TileType::Mat, half, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, half, K, N, BLayout::ColMajor, K, N, SLayout::RowMajor, 512>;
+
+    using LeftTile = TileLeft<half, M, K, M, K>;
+    using RightTile = TileRight<half, K, N, K, N>;
+    using AccTile = TileAcc<float, M, N, M, N>;
+
+    TileMatA aMatTile;
+    TileMatB bMatTile;
+    TASSIGN(aMatTile, 0x0);
+    TASSIGN(bMatTile, 0x20000);
+
+    LeftTile aTile;
+    RightTile bTile;
+    AccTile cTile;
+    TASSIGN(aTile, 0x0);
+    TASSIGN(bTile, 0x0);
+    TASSIGN(cTile, 0x0);
+
+    for (uint64_t b = 0; b < batch_count; b++) {
+        __gm__ half* pij_addr = pij_base + b * M * K;
+        int32_t phys_block = bt[b * block_num + block_idx];
+        __gm__ half* vj_addr = val_base + (uint64_t)phys_block * K * N;
+        __gm__ float* oi_addr = oi_base + b * M * N;
+
+        GlobalA pijGlobal(pij_addr);
+        GlobalB vjGlobal(vj_addr);
+        GlobalOut oiGlobal(oi_addr);
+
+        TLOAD(aMatTile, pijGlobal);
+        TLOAD(bMatTile, vjGlobal);
+
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+
+        TMOV(aTile, aMatTile);
+        TMOV(bTile, bMatTile);
+
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+        TMATMUL(cTile, aTile, bTile);
+
+        set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+        wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+
+        TSTORE(oiGlobal, cTile);
+
+        if (b + 1 < batch_count) {
+            pipe_barrier(PIPE_ALL);
+        }
+    }
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {
+    __gm__ Tensor* pij_batch = reinterpret_cast<__gm__ Tensor*>(args[0]);
+    __gm__ Tensor* value_cache = reinterpret_cast<__gm__ Tensor*>(args[1]);
+    __gm__ Tensor* oi_new_batch = reinterpret_cast<__gm__ Tensor*>(args[2]);
+    uint64_t block_table_ptr = static_cast<uint64_t>(args[3]);
+    uint64_t batch_count = static_cast<uint64_t>(args[4]);
+    uint64_t block_idx = static_cast<uint64_t>(args[5]);
+    uint64_t block_num = static_cast<uint64_t>(args[6]);
+
+    pv_matmul_batch_impl<16, 16, 16>(
+        pij_batch, value_cache, oi_new_batch,
+        block_table_ptr, batch_count, block_idx, block_num);
+}
diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp
new file mode 100644
index 000000000..ae467d724
--- /dev/null
+++ b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp
@@ -0,0 +1,112 @@
+// Batched QK Matmul Kernel: for each batch b, qi(M, K) @ kj.T(K, N) -> sij(M, N)
+//
+// Processes batch_count batches in a single kernel invocation.
+// Per-batch addresses are computed from global tensor bases + block_table lookup.
+//
+// Template: M=q_tile, K=head_dim, N=block_size (all 16 for current config)
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int K, int N>
+static __aicore__ void qk_matmul_batch_impl(
+    __gm__ Tensor* query,
+    __gm__ Tensor* key_cache,
+    __gm__ Tensor* sij_batch,
+    uint64_t block_table_ptr,
+    uint64_t batch_count,
+    uint64_t block_idx,
+    uint64_t q_offset,
+    uint64_t block_num,
+    uint64_t num_heads) {
+
+    __gm__ half* query_base = reinterpret_cast<__gm__ half*>(query->buffer.addr);
+    __gm__ half* key_base = reinterpret_cast<__gm__ half*>(key_cache->buffer.addr);
+    __gm__ float* sij_base = reinterpret_cast<__gm__ float*>(sij_batch->buffer.addr);
+    __gm__ int32_t* bt = reinterpret_cast<__gm__ int32_t*>(block_table_ptr);
+
+    using GlobalA = GlobalTensor<half, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
+    using GlobalB = GlobalTensor<half, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, 1, K>, Layout::DN>;
+    using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<M * N, M * N, M * N, N, 1>>;
+
+    using TileMatA = Tile<TileType::Mat, half, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, half, K, N, BLayout::RowMajor, K, N, SLayout::ColMajor, 512>;
+
+    using LeftTile = TileLeft<half, M, K, M, K>;
+    using RightTile = TileRight<half, K, N, K, N>;
+    using AccTile = TileAcc<float, M, N, M, N>;
+
+    TileMatA aMatTile;
+    TileMatB bMatTile;
+    TASSIGN(aMatTile, 0x0);
+    TASSIGN(bMatTile, 0x20000);
+
+    LeftTile aTile;
+    RightTile bTile;
+    AccTile cTile;
+    TASSIGN(aTile, 0x0);
+    TASSIGN(bTile, 0x0);
+    TASSIGN(cTile, 0x0);
+
+    for (uint64_t b = 0; b < batch_count; b++) {
+        __gm__ half* qi_addr = query_base + (b * num_heads + q_offset) * K;
+        int32_t phys_block = bt[b * block_num + block_idx];
+        __gm__ half* kj_addr = key_base + (uint64_t)phys_block * N * K;
+        __gm__ float* sij_addr = sij_base + b * M * N;
+
+        GlobalA qiGlobal(qi_addr);
+        GlobalB kjGlobal(kj_addr);
+        GlobalOut sijGlobal(sij_addr);
+
+        TLOAD(aMatTile, qiGlobal);
+        TLOAD(bMatTile, kjGlobal);
+
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+
+        TMOV(aTile, aMatTile);
+        TMOV(bTile, bMatTile);
+
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+        TMATMUL(cTile, aTile, bTile);
+
+        set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+        wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+
+        TSTORE(sijGlobal, cTile);
+
+        if (b + 1 < batch_count) {
+            pipe_barrier(PIPE_ALL);
+        }
+    }
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {
+    __gm__ Tensor* query = reinterpret_cast<__gm__ Tensor*>(args[0]);
+    __gm__ Tensor* key_cache = reinterpret_cast<__gm__ Tensor*>(args[1]);
+    __gm__ Tensor* sij_batch = reinterpret_cast<__gm__ Tensor*>(args[2]);
+    uint64_t block_table_ptr = static_cast<uint64_t>(args[3]);
+    uint64_t batch_count = static_cast<uint64_t>(args[4]);
+    uint64_t block_idx = static_cast<uint64_t>(args[5]);
+    uint64_t q_offset = static_cast<uint64_t>(args[6]);
+    uint64_t block_num = static_cast<uint64_t>(args[7]);
+    uint64_t num_heads = static_cast<uint64_t>(args[8]);
+
+    qk_matmul_batch_impl<16, 16, 16>(
+        query, key_cache, sij_batch,
+        block_table_ptr, batch_count, block_idx, q_offset, block_num, num_heads);
+}
diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp
new file mode 100644
index 000000000..0974de371
--- /dev/null
+++ b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp
@@ -0,0 +1,18 @@
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+using namespace pto;
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+constexpr int M = 16;
+constexpr int K = 16;
+constexpr int N = 16;
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {}
diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp
new file mode 100644
index 000000000..f0c082e3c
--- /dev/null
+++ b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp
@@ -0,0 +1,222 @@
+// Batched Online Softmax Update + Normalize Kernel (AIV)
+//
+// Processes batch_count batches in a single kernel invocation.
+// For each batch b, updates accumulators mi/li/oi with new block's mij/lij/oi_new.
+// On is_last, normalizes and writes to the output tensor at the correct batch offset.
+//
+// Scalar layout strategy (unchanged from unbatched version):
+//   M scalar floats stored contiguously in GM can be loaded as either:
+//   - ND (kScalarRows, kScalarCols) RowMajor for element-wise ops
+//   - DN (kAlignedRows, 1) ColMajor for row-broadcast ops
+//   Conversion between layouts uses GM round-trip: ND TSTORE -> DN TLOAD.
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int N>
+static __aicore__ void online_update_batch_impl(
+    __gm__ Tensor* mij_batch,
+    __gm__ Tensor* lij_batch,
+    __gm__ Tensor* oi_new_batch,
+    __gm__ Tensor* mi_batch,
+    __gm__ Tensor* li_batch,
+    __gm__ Tensor* oi_batch,
+    __gm__ Tensor* out,
+    uint64_t is_first,
+    uint64_t is_last,
+    uint64_t batch_count,
+    uint64_t q_offset,
+    uint64_t num_heads) {
+
+    __gm__ float* mij_base = reinterpret_cast<__gm__ float*>(mij_batch->buffer.addr);
+    __gm__ float* lij_base = reinterpret_cast<__gm__ float*>(lij_batch->buffer.addr);
+    __gm__ float* oi_new_base = reinterpret_cast<__gm__ float*>(oi_new_batch->buffer.addr);
+    __gm__ float* mi_base = reinterpret_cast<__gm__ float*>(mi_batch->buffer.addr);
+    __gm__ float* li_base = reinterpret_cast<__gm__ float*>(li_batch->buffer.addr);
+    __gm__ float* oi_base = reinterpret_cast<__gm__ float*>(oi_batch->buffer.addr);
+    __gm__ float* out_base = reinterpret_cast<__gm__ float*>(out->buffer.addr);
+
+    constexpr int kScalarCols = 32 / sizeof(float);
+    constexpr int kScalarRows = M / kScalarCols;
+    constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
+
+    using GlobalDataMxN = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
+    using GlobalScalarND =
+        GlobalTensor<float, Shape<1, 1, 1, kScalarRows, kScalarCols>, Stride<1, 1, 1, kScalarCols, 1>>;
+    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, Stride<1, 1, 1, 1, 1>, Layout::DN>;
+
+    using TileDataMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
+    using TileScalarND =
+        Tile<TileType::Vec, float, kScalarRows, kScalarCols, BLayout::RowMajor, kScalarRows, kScalarCols>;
+    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
+
+    constexpr int kDataBytes = M * N * sizeof(float);
+    constexpr int kScalarNDBytes = kScalarRows * kScalarCols * sizeof(float);
+    constexpr int kScalarDNBytes = kAlignedRows * sizeof(float);
+
+    TileDataMxN oiNewTile;
+    TileDataMxN oiTile;
+
+    TileScalarND mijND, lijND, miND, liND;
+    TileScalarND miNewND, alphaND, betaND, tmpND;
+
+    TileScalarDN alphaDN, betaDN, liDN;
+
+    TASSIGN(oiNewTile, 0);
+    TASSIGN(oiTile, kDataBytes);
+    TASSIGN(mijND, 2 * kDataBytes);
+    TASSIGN(lijND, 2 * kDataBytes + kScalarNDBytes);
+    TASSIGN(miND, 2 * kDataBytes + 2 * kScalarNDBytes);
+    TASSIGN(liND, 2 * kDataBytes + 3 * kScalarNDBytes);
+    TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarNDBytes);
+    TASSIGN(alphaND, 2 * kDataBytes + 5 * kScalarNDBytes);
+    TASSIGN(betaND, 2 * kDataBytes + 6 * kScalarNDBytes);
+    TASSIGN(tmpND, 2 * kDataBytes + 7 * kScalarNDBytes);
+    TASSIGN(alphaDN, 2 * kDataBytes + 8 * kScalarNDBytes);
+    TASSIGN(betaDN, 2 * kDataBytes + 8 * kScalarNDBytes + kScalarDNBytes);
+    TASSIGN(liDN, 2 * kDataBytes + 8 * kScalarNDBytes + 2 * kScalarDNBytes);
+
+    for (uint64_t b = 0; b < batch_count; b++) {
+        __gm__ float* mij_ptr = mij_base + b * M;
+        __gm__ float* lij_ptr = lij_base + b * M;
+        __gm__ float* oi_new_ptr = oi_new_base + b * M * N;
+        __gm__ float* mi_ptr = mi_base + b * M;
+        __gm__ float* li_ptr = li_base + b * M;
+        __gm__ float* oi_ptr = oi_base + b * M * N;
+        __gm__ float* dst_ptr = out_base + (b * num_heads + q_offset) * N;
+
+        GlobalDataMxN oiNewGlobal(oi_new_ptr);
+        GlobalDataMxN oiGlobal(oi_ptr);
+        GlobalDataMxN dstGlobal(dst_ptr);
+
+        GlobalScalarND mijGlobalND(mij_ptr);
+        GlobalScalarND lijGlobalND(lij_ptr);
+        GlobalScalarND miGlobalND(mi_ptr);
+        GlobalScalarND liGlobalND(li_ptr);
+
+        GlobalScalarDN mijGlobalDN(mij_ptr);
+        GlobalScalarDN lijGlobalDN(lij_ptr);
+        GlobalScalarDN liGlobalDN(li_ptr);
+
+        if (is_first) {
+            TLOAD(oiNewTile, oiNewGlobal);
+            TLOAD(mijND, mijGlobalND);
+            TLOAD(lijND, lijGlobalND);
+            set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+            wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(miGlobalND, mijND);
+            TSTORE(liGlobalND, lijND);
+            TSTORE(oiGlobal, oiNewTile);
+
+            if (is_last) {
+                set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+                wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+                TLOAD(liDN, liGlobalDN);
+                set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+                wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+                TROWEXPANDDIV(oiNewTile, oiNewTile, liDN);
+                set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+                wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+                TSTORE(dstGlobal, oiNewTile);
+            }
+        } else {
+            TLOAD(oiNewTile, oiNewGlobal);
+            TLOAD(oiTile, oiGlobal);
+            TLOAD(mijND, mijGlobalND);
+            TLOAD(lijND, lijGlobalND);
+            TLOAD(miND, miGlobalND);
+            TLOAD(liND, liGlobalND);
+            set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+            wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+            TMAX(miNewND, miND, mijND);
+            pipe_barrier(PIPE_V);
+            TSUB(alphaND, miND, miNewND);
+            pipe_barrier(PIPE_V);
+            TEXP(alphaND, alphaND);
+            pipe_barrier(PIPE_V);
+            TSUB(betaND, mijND, miNewND);
+            pipe_barrier(PIPE_V);
+            TEXP(betaND, betaND);
+            pipe_barrier(PIPE_V);
+            TMUL(liND, alphaND, liND);
+            pipe_barrier(PIPE_V);
+            TMUL(tmpND, betaND, lijND);
+            pipe_barrier(PIPE_V);
+            TADD(liND, liND, tmpND);
+
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(miGlobalND, miNewND);
+            TSTORE(liGlobalND, liND);
+            TSTORE(mijGlobalND, alphaND);
+            TSTORE(lijGlobalND, betaND);
+
+            set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+            wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+            TLOAD(alphaDN, mijGlobalDN);
+            TLOAD(betaDN, lijGlobalDN);
+            if (is_last) {
+                TLOAD(liDN, liGlobalDN);
+            }
+            set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+            wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+
+            TROWEXPANDMUL(oiTile, oiTile, alphaDN);
+            TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN);
+            pipe_barrier(PIPE_V);
+            TADD(oiTile, oiTile, oiNewTile);
+
+            if (is_last) {
+                pipe_barrier(PIPE_V);
+                TROWEXPANDDIV(oiTile, oiTile, liDN);
+                set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+                wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+                TSTORE(dstGlobal, oiTile);
+            } else {
+                set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+                wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+                TSTORE(oiGlobal, oiTile);
+            }
+        }
+
+        if (b + 1 < batch_count) {
+            pipe_barrier(PIPE_ALL);
+        }
+    }
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {
+    __gm__ Tensor* mij_batch = reinterpret_cast<__gm__ Tensor*>(args[0]);
+    __gm__ Tensor* lij_batch = reinterpret_cast<__gm__ Tensor*>(args[1]);
+    __gm__ Tensor* oi_new_batch = reinterpret_cast<__gm__ Tensor*>(args[2]);
+    __gm__ Tensor* mi_batch = reinterpret_cast<__gm__ Tensor*>(args[3]);
+    __gm__ Tensor* li_batch = reinterpret_cast<__gm__ Tensor*>(args[4]);
+    __gm__ Tensor* oi_batch = reinterpret_cast<__gm__ Tensor*>(args[5]);
+    __gm__ Tensor* out = reinterpret_cast<__gm__ Tensor*>(args[6]);
+    uint64_t is_first = static_cast<uint64_t>(args[7]);
+    uint64_t is_last = static_cast<uint64_t>(args[8]);
+    uint64_t batch_count = static_cast<uint64_t>(args[9]);
+    uint64_t q_offset = static_cast<uint64_t>(args[10]);
+    uint64_t num_heads = static_cast<uint64_t>(args[11]);
+
+    online_update_batch_impl<16, 16>(
+        mij_batch, lij_batch, oi_new_batch,
+        mi_batch, li_batch, oi_batch, out,
+        is_first, is_last, batch_count, q_offset, num_heads);
+}
diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
new file mode 100644
index 000000000..656271423
--- /dev/null
+++ b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
@@ -0,0 +1,146 @@
+// Batched Softmax Preparation Kernel (AIV)
+//
+// Processes batch_count batches in a single kernel invocation.
+// For each batch b at block_idx bn:
+//   valid_len = min(N, context_lens[b] - bn * N)
+//   sij_masked = pad(sij[b], valid_len, -inf)
+//   sij_scale  = sij_masked * scale
+//   mij[b]     = row_max(sij_scale)
+//   pij[b]     = exp(sij_scale - mij[b])  (truncated to fp16 then back)
+//   lij[b]     = row_sum(pij[b])
+
+#include <cstdint>
+#include <pto/pto-inst.hpp>
+
+#include "tensor.h"
+
+using namespace pto;
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#define __aicore__ [aicore]
+#endif
+
+template <int M, int N>
+static __aicore__ void softmax_prepare_batch_impl(
+    __gm__ Tensor* sij_batch,
+    __gm__ Tensor* pij_batch,
+    __gm__ Tensor* mij_batch,
+    __gm__ Tensor* lij_batch,
+    float scale_value,
+    uint64_t context_lens_ptr,
+    uint64_t batch_count,
+    uint64_t block_idx) {
+
+    __gm__ float* sij_base = reinterpret_cast<__gm__ float*>(sij_batch->buffer.addr);
+    __gm__ half* pij_base = reinterpret_cast<__gm__ half*>(pij_batch->buffer.addr);
+    __gm__ float* mij_base = reinterpret_cast<__gm__ float*>(mij_batch->buffer.addr);
+    __gm__ float* lij_base = reinterpret_cast<__gm__ float*>(lij_batch->buffer.addr);
+    __gm__ int32_t* ctx_lens = reinterpret_cast<__gm__ int32_t*>(context_lens_ptr);
+
+    constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
+
+    using GlobalDataMxN = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
+    using GlobalDataMxN_f16 = GlobalTensor<half, Shape<1, 1, 1, M, N>, Stride<1, 1, 1, N, 1>>;
+    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, Stride<1, 1, 1, 1, 1>, Layout::DN>;
+
+    using TileSijDyn = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, -1>;
+    using TileSijPad = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N, SLayout::NoneBox, 512, PadValue::Min>;
+
+    using TileVecMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
+    using TileVecMxN_f16 = Tile<TileType::Vec, half, M, N, BLayout::RowMajor, M, N>;
+    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
+
+    TileVecMxN sijTile;
+    TileSijPad sijPadTile;
+    TileVecMxN pijTile;
+    TileVecMxN tmpTile;
+    TileScalarDN maxTile;
+    TileScalarDN sumTile;
+    TileVecMxN_f16 pijF16Tile;
+
+    TASSIGN(sijTile, 0x0);
+    TASSIGN(sijPadTile, 0x0);
+    TASSIGN(pijTile, M * N * sizeof(float));
+    TASSIGN(tmpTile, 2 * M * N * sizeof(float));
+    TASSIGN(maxTile, 3 * M * N * sizeof(float));
+    TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float));
+    TASSIGN(pijF16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float));
+
+    for (uint64_t b = 0; b < batch_count; b++) {
+        int32_t cur_seq = ctx_lens[b];
+        uint64_t start = block_idx * N;
+        uint64_t valid_len = N;
+        if (start < (uint64_t)cur_seq) {
+            uint64_t remaining = (uint64_t)cur_seq - start;
+            if (remaining < (uint64_t)N) valid_len = remaining;
+        }
+
+        __gm__ float* sij_addr = sij_base + b * M * N;
+        __gm__ half* pij_addr = pij_base + b * M * N;
+        __gm__ float* mij_addr = mij_base + b * M;
+        __gm__ float* lij_addr = lij_base + b * M;
+
+        GlobalDataMxN sijGlobal(sij_addr);
+        GlobalDataMxN_f16 pijGlobal(pij_addr);
+        GlobalScalarDN mijGlobal(mij_addr);
+        GlobalScalarDN lijGlobal(lij_addr);
+
+        TLOAD(sijTile, sijGlobal);
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        TileSijDyn sijDynTile(static_cast<size_t>(valid_len));
+        TASSIGN(sijDynTile, 0x0);
+        TFILLPAD_INPLACE(sijPadTile, sijDynTile);
+        if (valid_len < static_cast<uint64_t>(N)) {
+            constexpr float NEG_INF = -__builtin_huge_valf();
+            for (int r = 0; r < M; r++) {
+                for (uint64_t c = valid_len; c < N; c++) {
+                    sijTile.SetValue(static_cast<uint32_t>(r * N + c), NEG_INF);
+                }
+            }
+        }
+
+        TMULS(sijTile, sijTile, scale_value);
+        pipe_barrier(PIPE_V);
+        TROWMAX(maxTile, sijTile, tmpTile);
+        pipe_barrier(PIPE_V);
+        TROWEXPANDSUB(pijTile, sijTile, maxTile);
+        pipe_barrier(PIPE_V);
+        TEXP(pijTile, pijTile);
+        TCVT(pijF16Tile, pijTile, RoundMode::CAST_ROUND);
+        TCVT(pijTile, pijF16Tile, RoundMode::CAST_ROUND);
+        TROWSUM(sumTile, pijTile, tmpTile);
+
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        TSTORE(mijGlobal, maxTile);
+        TSTORE(lijGlobal, sumTile);
+        TSTORE(pijGlobal, pijF16Tile);
+
+        if (b + 1 < batch_count) {
+            pipe_barrier(PIPE_ALL);
+        }
+    }
+}
+
+extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {
+    __gm__ Tensor* sij_batch = reinterpret_cast<__gm__ Tensor*>(args[0]);
+    __gm__ Tensor* pij_batch = reinterpret_cast<__gm__ Tensor*>(args[1]);
+    __gm__ Tensor* mij_batch = reinterpret_cast<__gm__ Tensor*>(args[2]);
+    __gm__ Tensor* lij_batch = reinterpret_cast<__gm__ Tensor*>(args[3]);
+    union { uint64_t u; float f; } scale_conv;
+    scale_conv.u = static_cast<uint64_t>(args[4]);
+    float scale_value = scale_conv.f;
+    uint64_t context_lens_ptr = static_cast<uint64_t>(args[5]);
+    uint64_t batch_count = static_cast<uint64_t>(args[6]);
+    uint64_t block_idx = static_cast<uint64_t>(args[7]);
+
+    softmax_prepare_batch_impl<16, 16>(
+        sij_batch, pij_batch, mij_batch, lij_batch,
+        scale_value, context_lens_ptr, batch_count, block_idx);
+}
diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py
new file mode 100644
index 000000000..6ce6a0dbf
--- /dev/null
+++ b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py
@@ -0,0 +1,45 @@
+"""
+Paged Attention Kernel and Orchestration Configuration
+
+Defines the kernels and orchestration function for paged attention
+with AIC/AIV subgraph splitting:
+
+AIC Kernels (Matrix Multiplication):
+  - aic_qk_matmul: Q @ K^T computation
+  - aic_pv_matmul: P @ V computation
+
+AIV Kernels (Vector Operations):
+  - aiv_softmax_prepare: scale, rowmax, exp, rowsum
+  - aiv_online_update: online softmax accumulation + fused normalization
+
+Note: aiv_normalize has been merged into aiv_online_update for efficiency.
+"""
+
+from pathlib import Path
+
+_KERNELS_ROOT = Path(__file__).parent
+
+# Orchestration config
+ORCHESTRATION = {
+    "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"),
+    "function_name": "aicpu_orchestration_entry",
+}
+
+# Kernel configs (aiv_normalize removed - merged into aiv_online_update)
+KERNELS = [
+    # AIC kernels (matrix multiplication using Cube unit)
+    {"func_id": 0, "name": "QK", "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"),       "core_type": "aic"},
+    {"func_id": 2, "name": "PV", "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"),       "core_type": "aic"},
+    {"func_id": 4, "name": "AIC_HUB", "source": str(_KERNELS_ROOT / "aic" / "aic_hub.cpp"),       "core_type": "aic"},
+    # AIV kernels (vector operations)
+    {"func_id": 1, "name": "SF", "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"), "core_type": "aiv"},
+    {"func_id": 3, "name": "UP", "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"),   "core_type": "aiv"},
+    {"func_id": 5, "name": "AIV_HUB", "source": str(_KERNELS_ROOT / "aiv" / "aiv_hub.cpp"),       "core_type": "aiv"},
+]
+
+# Runtime configuration
+RUNTIME_CONFIG = {
+    "runtime": "tensormap_and_ringbuffer",
+    "aicpu_thread_num": 4,
+    "block_dim": 24,
+}
diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp
new file mode 100644
index 000000000..29964f767
--- /dev/null
+++ b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp
@@ -0,0 +1,198 @@
+/**
+ * Batch Paged Attention Orchestration Function - 16x16 Version
+ *
+ * Batched architecture: the batch loop is moved inside kernels,
+ * so task count is fixed at 1 + max_bn * 4 regardless of batch size.
+ *
+ * Memory Layout:
+ *   Query: (batch * num_heads, head_dim) fp16
+ *   Key:   (total_blocks, block_size, head_dim) fp16 (stored as K^T for QK)
+ *   Value: (total_blocks, block_size, head_dim) fp16
+ *
+ * Intermediate batched tensors (contiguous across batch dimension):
+ *   sij_batch:     (batch * q_tile, block_size)  fp32
+ *   pij_batch:     (batch * q_tile, block_size)  fp16
+ *   mij/lij_batch: (batch * q_tile)              fp32
+ *   oi_new_batch:  (batch * q_tile, head_dim)    fp32
+ *   oi_batch:      (batch * q_tile, head_dim)    fp32  accumulator
+ *   mi/li_batch:   (batch * q_tile)              fp32  accumulator
+ *
+ * Kernels receive global tensors + scalar metadata and compute per-batch
+ * addresses internally, reusing L1/L0/UB tile buffers across iterations.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "pto_orchestration_api.h"
+
+#define FUNC_QK_MATMUL 0
+#define FUNC_SOFTMAX_PREPARE 1
+#define FUNC_PV_MATMUL 2
+#define FUNC_ONLINE_UPDATE 3
+#define FUNC_AIC_HUB 4
+#define FUNC_AIV_HUB 5
+
+static uint64_t float_to_u64(float f) {
+    union {
+        float f32;
+        uint64_t u64;
+    } conv;
+    conv.u64 = 0;
+    conv.f32 = f;
+    return conv.u64;
+}
+
+extern "C" {
+
+__attribute__((visibility("default")))
+PTO2OrchestrationConfig aicpu_orchestration_config(uint64_t* args, int arg_count) {
+    (void)args;
+    (void)arg_count;
+    return PTO2OrchestrationConfig{
+        .expected_arg_count = 7,
+    };
+}
+
+__attribute__((visibility("default")))
+void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count) {
+    (void)arg_count;
+
+    void* host_query = (void*)(uintptr_t)args[0];
+    void* host_key_cache = (void*)(uintptr_t)args[1];
+    void* host_value_cache = (void*)(uintptr_t)args[2];
+    int* host_block_table = (int*)(uintptr_t)args[3];
+    int* host_context_lens = (int*)(uintptr_t)args[4];
+    void* host_out = (void*)(uintptr_t)args[5];
+    int64_t* host_config = (int64_t*)(uintptr_t)args[6];
+
+    size_t key_cache_size = (size_t)args[8];
+
+    uint64_t batch = (uint64_t)(int)host_config[0];
+    uint64_t num_heads = (uint64_t)(int)host_config[1];
+    uint64_t head_dim = (uint64_t)(int)host_config[3];
+    uint64_t block_size = (uint64_t)(int)host_config[4];
+    uint64_t block_num = (uint64_t)(int)host_config[5];
+    union { uint32_t u; float f; } scale_conv;
+    scale_conv.u = (uint32_t)host_config[6];
+    float scale_value = scale_conv.f;
+
+    uint64_t q_tile = 16;
+    uint64_t q_loop = (num_heads + q_tile - 1) / q_tile;
+    DataType data_type = DataType::FLOAT16;
+    uint64_t elem_size = get_element_size(data_type);
+
+    LOG_INFO(rt, "batch_paged_attention: batch=%lu, num_heads=%lu",
+             (unsigned long)batch, (unsigned long)num_heads);
+
+    uint64_t max_bn = 0;
+    for (uint64_t b = 0; b < batch; b++) {
+        uint64_t cur_seq = host_context_lens[b];
+        uint64_t bn_b = (cur_seq + block_size - 1) / block_size;
+        if (bn_b > max_bn) max_bn = bn_b;
+    }
+
+    uint64_t query_shapes[2] = {batch * num_heads, head_dim};
+    uint64_t kv_total_rows = key_cache_size / (head_dim * elem_size);
+    uint64_t key_cache_shapes[2] = {kv_total_rows, head_dim};
+    uint64_t value_cache_shapes[2] = {kv_total_rows, head_dim};
+    uint64_t out_shapes[2] = {batch * num_heads, head_dim};
+
+    Tensor query = make_tensor_external(host_query, query_shapes, 2, data_type);
+    Tensor key_cache = make_tensor_external(host_key_cache, key_cache_shapes, 2, data_type);
+    Tensor value_cache = make_tensor_external(host_value_cache, value_cache_shapes, 2, data_type);
+    Tensor out = make_tensor_external(host_out, out_shapes, 2, DataType::FLOAT32);
+
+    uint64_t bt_addr = (uint64_t)(uintptr_t)host_block_table;
+    uint64_t cl_addr = (uint64_t)(uintptr_t)host_context_lens;
+
+    for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) {
+        PTO2_SCOPE(rt) {
+            uint64_t q_offset = q_idx * q_tile;
+
+            uint64_t oi_acc_shapes[2] = {batch * q_tile, head_dim};
+            uint64_t scalar_acc_shapes[1] = {batch * q_tile};
+            Tensor oi_batch = make_tensor(oi_acc_shapes, 2, DataType::FLOAT32);
+            Tensor li_batch = make_tensor(scalar_acc_shapes, 1, DataType::FLOAT32);
+            Tensor mi_batch = make_tensor(scalar_acc_shapes, 1, DataType::FLOAT32);
+
+            PTOParam params_hub[] = {
+                make_output_param(oi_batch),
+                make_output_param(li_batch),
+                make_output_param(mi_batch),
+            };
+            pto2_rt_submit_task(rt, FUNC_AIV_HUB, PTO2_WORKER_VECTOR, params_hub, 3);
+
+            for (uint64_t bn = 0; bn < max_bn; bn++) {
+                uint64_t sij_shapes[2] = {batch * q_tile, block_size};
+                uint64_t vec_shapes[1] = {batch * q_tile};
+                uint64_t oi_new_shapes[2] = {batch * q_tile, head_dim};
+
+                Tensor sij_b = make_tensor(sij_shapes, 2, DataType::FLOAT32);
+                Tensor pij_b = make_tensor(sij_shapes, 2, data_type);
+                Tensor mij_b = make_tensor(vec_shapes, 1, DataType::FLOAT32);
+                Tensor lij_b = make_tensor(vec_shapes, 1, DataType::FLOAT32);
+                Tensor oi_new_b = make_tensor(oi_new_shapes, 2, DataType::FLOAT32);
+
+                PTOParam params_qk[] = {
+                    make_input_param(query),
+                    make_input_param(key_cache),
+                    make_output_param(sij_b),
+                    make_scalar_param(bt_addr),
+                    make_scalar_param(batch),
+                    make_scalar_param(bn),
+                    make_scalar_param(q_offset),
+                    make_scalar_param(block_num),
+                    make_scalar_param(num_heads),
+                };
+                pto2_rt_submit_task(rt, FUNC_QK_MATMUL, PTO2_WORKER_CUBE, params_qk, 9);
+
+                PTOParam params_sf[] = {
+                    make_input_param(sij_b),
+                    make_output_param(pij_b),
+                    make_output_param(mij_b),
+                    make_output_param(lij_b),
+                    make_scalar_param(float_to_u64(scale_value)),
+                    make_scalar_param(cl_addr),
+                    make_scalar_param(batch),
+                    make_scalar_param(bn),
+                };
+                pto2_rt_submit_task(rt, FUNC_SOFTMAX_PREPARE, PTO2_WORKER_VECTOR, params_sf, 8);
+
+                PTOParam params_pv[] = {
+                    make_input_param(pij_b),
+                    make_input_param(value_cache),
+                    make_output_param(oi_new_b),
+                    make_scalar_param(bt_addr),
+                    make_scalar_param(batch),
+                    make_scalar_param(bn),
+                    make_scalar_param(block_num),
+                };
+                pto2_rt_submit_task(rt, FUNC_PV_MATMUL, PTO2_WORKER_CUBE, params_pv, 7);
+
+                uint64_t is_first = (bn == 0) ? 1 : 0;
+                uint64_t is_last = (bn == max_bn - 1) ? 1 : 0;
+                PTOParam params_up[] = {
+                    make_input_param(mij_b),
+                    make_input_param(lij_b),
+                    make_input_param(oi_new_b),
+                    make_inout_param(mi_batch),
+                    make_inout_param(li_batch),
+                    make_output_param(oi_batch),
+                    make_output_param(out),
+                    make_scalar_param(is_first),
+                    make_scalar_param(is_last),
+                    make_scalar_param(batch),
+                    make_scalar_param(q_offset),
+                    make_scalar_param(num_heads),
+                };
+                pto2_rt_submit_task(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, params_up, 12);
+            }
+        }
+    }
+
+    LOG_INFO(rt, "batch_paged_attention: %lu tasks (batch=%lu, max_bn=%lu)",
+             (unsigned long)(1 + max_bn * 4), (unsigned long)batch, (unsigned long)max_bn);
+}
+
+}  // extern "C"

From dbda223495677fce3fd15db3dd5907a5d3b1d6d5 Mon Sep 17 00:00:00 2001
From: liaoheng <hengliao1972@163.com>
Date: Thu, 26 Feb 2026 12:15:57 +0800
Subject: [PATCH 4/6] Docs: add performance analysis tools and documentation

- Enhance swimlane_converter with task statistics and profiling output
- Add tail_oh_breakdown.py for scheduler overhead analysis
- Add Case1 Tail OH breakdown documentation
- Add batch paged attention performance summary (batch 1-256)
- Add scheduler overhead analysis notes
---
 docs/batch_paged_attention_perf_summary.md |  94 ++++
 docs/case1_tail_oh_breakdown.md            | 537 +++++++++++++++++++++
 tools/scheduler_overhead_analysis.md       |   0
 tools/swimlane_converter.py                |  61 ++-
 tools/tail_oh_breakdown.py                 | 196 ++++++++
 5 files changed, 885 insertions(+), 3 deletions(-)
 create mode 100644 docs/batch_paged_attention_perf_summary.md
 create mode 100644 docs/case1_tail_oh_breakdown.md
 create mode 100644 tools/scheduler_overhead_analysis.md
 create mode 100644 tools/tail_oh_breakdown.py

diff --git a/docs/batch_paged_attention_perf_summary.md b/docs/batch_paged_attention_perf_summary.md
new file mode 100644
index 000000000..899d3b5f7
--- /dev/null
+++ b/docs/batch_paged_attention_perf_summary.md
@@ -0,0 +1,94 @@
+# Batch Paged Attention 性能对比总结
+
+## 测试用例一览
+
+所有用例参数一致（num_heads=16, head_dim=16, block_size=16, context_len=33, max_model_len=256），仅 batch 大小不同。
+
+| 用例 | Batch |
+|------|-------|
+| Case1 | 1 |
+| CaseBatch2 | 2 |
+| CaseBatch4 | 4 |
+| CaseBatch8 | 8 |
+| CaseBatch16 | 16 |
+| CaseBatch32 | 32 |
+| CaseBatch64 | 64 |
+| CaseBatch128 | 128 |
+| CaseBatch256 | 256 |
+
+## 架构对比
+
+### 旧架构（paged_attention/）
+
+- 编排器为每个 batch 生成一组独立任务（QK、SF、PV、UP × batch × block 数）
+- 任务数 = `1 + batch × num_blocks × 4`
+- batch=8 时产生 104 个任务，batch=16 时产生 208 个任务，**导致 AICPU 调度器 hang**
+
+### 新架构（batch_paged_attention/）
+
+- 将 batch 循环下放到 kernel 内部，每个 kernel 在一次任务中处理所有 batch
+- 任务数恒定为 **13 个**（1 HUB + num_blocks × 4 kernels），与 batch 大小无关
+- batch=256 仍然只有 13 个任务，彻底消除任务数爆炸问题
+
+## 性能数据
+
+| 用例 | Batch | 输出元素数 | 总 Exec (us) | 总 Sched (us) | 端到端 (us) | Avg Exec/task (us) | Avg Sched/task (us) | Exec/Sched % |
+|------|-------|-----------|-------------|--------------|------------|-------------------|-------------------|-------------|
+| Case1 | 1 | 256 | 38.48 | 262.50 | 160.64 | 2.96 | 20.19 | 14.66% |
+| CaseBatch2 | 2 | 512 | 53.64 | 239.38 | 163.92 | 4.13 | 18.41 | 22.41% |
+| CaseBatch4 | 4 | 1,024 | 85.10 | 327.60 | 206.26 | 6.55 | 25.20 | 25.98% |
+| CaseBatch8 | 8 | 2,048 | 122.24 | 346.02 | 198.28 | 9.40 | 26.62 | 35.33% |
+| CaseBatch16 | 16 | 4,096 | 233.72 | 459.14 | 259.96 | 17.98 | 35.32 | 50.90% |
+| CaseBatch32 | 32 | 8,192 | 442.74 | 729.48 | 386.68 | 34.06 | 56.11 | 60.69% |
+| CaseBatch64 | 64 | 16,384 | 870.38 | 1,104.70 | 562.80 | 66.95 | 84.98 | 78.79% |
+| CaseBatch128 | 128 | 32,768 | 1,719.82 | 1,942.32 | 949.16 | 132.29 | 149.41 | 88.54% |
+| CaseBatch256 | 256 | 65,536 | 3,470.28 | 3,720.64 | 1,763.20 | 266.94 | 286.20 | 93.27% |
+
+> 任务数在所有用例中均为 **13**。
+
+## 关键发现
+
+### 1. 调度效率随 batch 增大显著提升
+
+Exec/Sched 比率从 batch=1 的 14.66% 攀升到 batch=256 的 93.27%：
+
+```
+batch=1   ██░░░░░░░░░░░░░░░░░░  14.66%
+batch=2   ████░░░░░░░░░░░░░░░░  22.41%
+batch=4   █████░░░░░░░░░░░░░░░  25.98%
+batch=8   ███████░░░░░░░░░░░░░  35.33%
+batch=16  ██████████░░░░░░░░░░  50.90%
+batch=32  ████████████░░░░░░░░  60.69%
+batch=64  ███████████████░░░░░  78.79%
+batch=128 █████████████████░░░  88.54%
+batch=256 ██████████████████░░  93.27%
+```
+
+这说明调度开销（Head OH + Tail OH）是近似固定的，当 kernel 执行时间随 batch 增大而增长时，调度开销被有效摊销。
+
+### 2. 端到端延迟线性增长远低于 batch 增长倍数
+
+| 对比 | Batch 增长倍数 | 端到端增长倍数 |
+|------|--------------|--------------|
+| 1 → 16 | 16× | 1.6× |
+| 1 → 64 | 64× | 3.5× |
+| 1 → 256 | 256× | 11.0× |
+
+batch 增大 256 倍时，端到端延迟仅增加约 11 倍，体现了批处理架构对调度开销的高效摊销。
+
+### 3. 每任务平均调度时间基本稳定
+
+Avg Sched/task 从 20.19 us (batch=1) 缓慢增长到 286.20 us (batch=256)，其中增长部分几乎全部来自 kernel 执行时间的增加（Avg Exec 从 2.96 us 增长到 266.94 us），实际调度开销（Sched - Exec ≈ 17~19 us）保持相对稳定。
+
+### 4. 浮点非确定性
+
+大 batch 下偶现少量元素不匹配（<0.2%），为硬件浮点特性导致的间歇性行为，在 rtol=1e-2, atol=1e-2 容差下属于边界情况。重跑可通过。
+
+## 结论
+
+新的 batch_paged_attention 架构通过将 batch 循环下放到 kernel 内部，成功实现了：
+
+1. **任务数恒定**：消除了旧架构中任务数随 batch 线性增长的问题
+2. **支持大 batch**：从旧架构 batch=16 即 hang，到新架构 batch=256 正常运行
+3. **高效利用计算资源**：batch=256 时 93.27% 的时间用于实际计算
+4. **调度开销摊销**：固定的调度开销在大 batch 下被充分摊销
diff --git a/docs/case1_tail_oh_breakdown.md b/docs/case1_tail_oh_breakdown.md
new file mode 100644
index 000000000..01b3e29d9
--- /dev/null
+++ b/docs/case1_tail_oh_breakdown.md
@@ -0,0 +1,537 @@
+# Case1 Tail OH 完整 Breakdown
+
+> 数据来源：`PA_CASE=Case1 --enable-profiling`，16,704 tasks, 3 scheduler threads × 24 cores/thread
+
+---
+
+## Part 1: 每任务时间分解（Perf 采集数据）
+
+每个任务经历四段时间：
+
+```
+dispatch_time ──→ start_time ──→ end_time ──→ finish_time
+     │   Head OH    │   Exec     │   Tail OH     │
+```
+
+| 分量 | 总时间 (us) | 每任务平均 (us) | 占 Wall-clock |
+|------|------------|----------------|---------------|
+| Kernel Exec (end − start) | 29,743 | 1.78 | 82.9% |
+| Head OH (start − dispatch) | 30,672 | 1.84 | 85.5% |
+| **Tail OH (finish − end)** | **793,724** | **47.52** | **2212.7%** |
+
+- Wall-clock 总耗时：**35,872 us**
+- Tail OH 总和远超 wall-clock，因为 16,704 个任务的 Tail OH 是**各自独立累加**的（存在大量并行重叠）。
+
+---
+
+## Part 2: AICPU 调度器循环 CPU 时间 Breakdown（Device Log）
+
+### 2.1 三个调度线程概况
+
+| Thread | Loops | 完成任务数 | 总 CPU 时间 (us) |
+|--------|-------|-----------|-----------------|
+| T0 | 706 | 5,864 | 42,679 |
+| T1 | 690 | 5,663 | 42,648 |
+| T2 | 591 | 5,177 | 42,653 |
+| **SUM** | **1,987** | **16,704** | **127,979** |
+
+### 2.2 调度器循环各阶段 CPU 时间
+
+每次循环按顺序执行：
+
+```
+┌─ Phase 1: Complete ─┐  ┌─ Phase 2: Dispatch ─┐  ┌─ Scan ─┐  ┌─ Orch Drain ─┐  ┌─ Yield ─┐
+│ 遍历所有 24 个 core  │  │ 为空闲 core 派发任务 │  │ 发现新  │  │ 处理编排器  │  │ 无进展  │
+│ 检查 handshake      │  │ pop ready queue     │  │ 根任务  │  │ 就绪队列    │  │ 让出CPU │
+│ 记录 finish_ts      │  │ build_payload       │  │         │  │             │  │         │
+│ 解析 fanout 依赖    │  │ cache flush (dc+dsb)│  │         │  │             │  │         │
+└─────────────────────┘  └─────────────────────┘  └─────────┘  └─────────────┘  └─────────┘
+```
+
+| 阶段 | CPU 时间 (us) | 占比 | 每任务 (us) | 主要开销 |
+|------|--------------|------|------------|---------|
+| **Dispatch** | **79,587** | **62.2%** | **4.76** | cache flush (`dc cvac` + `dsb sy`) |
+| Complete | 43,968 | 34.4% | 2.63 | handshake 轮询 + fanout atomic ops |
+| Scan | 3,797 | 3.0% | 0.23 | 新任务发现 |
+| Orch Drain | 64 | 0.0% | 0.00 | 编排器就绪队列消费 |
+| Yield | 563 | 0.4% | 0.03 | thread_yield() |
+| **Total** | **127,979** | | **7.66** | |
+
+### 2.3 锁竞争
+
+| 分项 | 等锁 (us) | 持锁 (us) |
+|------|----------|----------|
+| Dispatch (pop ready_q) | 29,156 | 6,443 |
+| Complete (push ready_q) | 3,043 | 1,200 |
+| Scan | 394 | 335 |
+| **Total** | **32,592 (25.5%)** | **7,978 (6.2%)** |
+
+### 2.4 Fanout 依赖解析
+
+- 总遍历次数：22,088
+- 最长 fanout 链：35
+- 平均 fanout/任务：1.3
+- Fanout 锁竞争：spin=0us, hold=0us（无竞争）
+
+---
+
+## Part 3: Tail OH 分布
+
+| 分位数 | Tail OH (us) |
+|--------|-------------|
+| P10 | 33.4 |
+| P25 | 41.0 |
+| **P50** | **48.3** |
+| P75 | 54.6 |
+| P90 | 59.8 |
+| P95 | 62.9 |
+| P99 | 68.8 |
+| Max | 192.4 |
+| **Mean** | **47.5** |
+
+---
+
+## 关键问题解析
+
+### Q1: 为什么 Part 1 的每任务 Tail OH (47.52 us) 和 Part 2 的每任务 CPU 时间 (7.66 us) 对不上？
+
+**核心区别：Part 1 测的是 wall-clock 等待时间，Part 2 测的是 CPU 分摊成本。**
+
+调度器循环结构如下（以一个线程为例）：
+
+```
+Loop iteration #N (avg 64.4 us)
+├── Phase 1: 遍历 24 cores，检查哪些完成      ← 某个任务的 finish_ts 在这里记录
+├── Phase 2: 遍历 24 cores，派发就绪任务
+├── Scan: 扫描新提交的任务
+└── Yield (如果无进展)
+
+Loop iteration #N+1 ...
+├── Phase 1: 再次遍历 24 cores               ← 上一轮没检测到的任务，在这里被发现
+...
+```
+
+**每次循环迭代平均处理 ~8.4 个任务**（16,704 tasks ÷ 1,987 loops）。
+
+- **Part 2 的 7.66 us/task**：把一次循环 64.4 us 的 CPU 时间平摊到这 8.4 个任务上 → 64.4 ÷ 8.4 ≈ 7.66 us。这是 **AICPU 为每个任务付出的 CPU 成本**。
+
+- **Part 1 的 47.52 us/task**：每个任务从 kernel 执行完 (`end_time`) 到被 Phase 1 检测到 (`finish_time`) 的 **wall-clock 等待**。即使循环只花 7.66 us 的 CPU 在"你的"任务上，你仍需要等整个循环把其他 7-8 个任务的工作也做完。
+
+**类比**：银行柜台有 3 个窗口（3 threads），每个窗口每轮叫 8 个号。柜员处理你的业务只要 1 分钟（CPU cost），但你要等前面 7 个人都处理完才能轮到——排队等待 8 分钟（wall-clock wait）。
+
+数值验证：
+```
+每线程每循环时间 = 42,660 us ÷ 706 loops ≈ 60.4 us (T0)
+任务平均在循环中间某个时刻完成
+→ 平均等待 ≈ 0.5 ~ 0.8 × 循环时间 ≈ 30 ~ 50 us
+→ 实测 Tail OH 均值 47.5 us ✓
+```
+
+### Q2: 为什么 Part 3 的 Tail OH 这么长？为什么 Part 2 没有体现？
+
+**Part 2 的数字已经完整体现了原因，只是需要换一个视角来理解。**
+
+Part 2 告诉我们：**每次循环迭代耗时 64.4 us**。这 64.4 us 就是 Tail OH 的根本上限。
+
+Tail OH 长的原因是调度循环慢。循环慢的原因在 Part 2 中清晰可见：
+
+```
+每次循环迭代 64.4 us 的时间花在哪里：
+
+  Dispatch (cache flush):  62.2% → ~40 us    ← 主要瓶颈
+  Complete (poll+fanout):  34.4% → ~22 us
+  Scan + Yield:             3.4% →  ~2 us
+```
+
+**Dispatch 阶段的 cache flush 是根因**。每次派发任务需要：
+1. `dc cvac` 逐 cacheline 刷新 PTO2DispatchPayload (多次, ~160 bytes / 64 = 3 lines)
+2. `dc civac` 刷新 Handshake (1 次)
+3. `dsb sy` 全局屏障：**阻塞 AICPU 流水线直到所有 dc 操作完成**
+
+一个循环中可能派发 8+ 个任务，每个都要经历这套 flush。加上锁竞争（29,156 us 总等锁），Dispatch 消耗了大量时间。
+
+**Part 2 和 Part 3 的联系**：
+
+| Part 2 观察 | → | Part 3 后果 |
+|-------------|---|------------|
+| 循环迭代 64.4 us | → | P50 Tail OH ≈ 48.3 us（等待约 0.75 个循环） |
+| Dispatch 占 62% | → | 即使 kernel 已完成，Phase 1 还没到就被 Dispatch 阻塞 |
+| 锁竞争 25.5% | → | 3 线程争抢 ready_q 锁，进一步拉长循环 |
+| P99 = 68.8 us ≈ 1 loop | → | 极端情况刚好错过本轮 Phase 1，要等完整下一轮 |
+| Max = 192.4 us ≈ 3 loops | → | 偶发竞争或 OS 调度导致多轮延迟 |
+
+### 总结：Tail OH 的因果链
+
+```
+                    Root Cause
+                        │
+           ┌────────────┴────────────┐
+           │  每次 Dispatch 需要     │
+           │  dc cvac + dsb sy      │
+           │  刷新 AICPU cache      │
+           └────────────┬────────────┘
+                        │
+           ┌────────────┴────────────┐
+           │  Dispatch 占循环 62%    │
+           │  + 锁竞争 25.5%        │
+           └────────────┬────────────┘
+                        │
+           ┌────────────┴────────────┐
+           │  循环迭代 ~64 us        │
+           │  (Phase1+Phase2+Scan)   │
+           └────────────┬────────────┘
+                        │
+           ┌────────────┴────────────┐
+           │  任务完成后平均等       │
+           │  ~47.5 us 才被检测到   │
+           └────────────┬────────────┘
+                        │
+              Tail OH ≈ 47.5 us/task
+              (占端到端时间的主导部分)
+```
+
+### 潜在优化方向
+
+1. **减少 cache flush 次数**：批量派发后统一执行一次 `dsb sy`，而非每个任务一次（见下方风险分析）
+2. **减少 flush 范围**：只 flush 真正需要的 cacheline（如 tensor_copies 部分可能不需要每次 flush）
+3. **降低锁竞争**：增加 ready_q shard 数量（当前 shard 数可能不足）
+4. **缩短 Phase 1 + Phase 2 路径**：减少每轮遍历的 core 数（针对实际使用的 core 数优化）
+
+---
+
+## 优化方案风险分析：批量 `dsb sy`
+
+### 当前实现：每派发一个任务执行一次完整 flush
+
+```
+for each idle core with a ready task:
+    build_pto2_payload(payload, ...)       // 写 payload 数据
+    h->task = payload_addr                 // 写 handshake.task
+    h->task_status = 1                     // 写 handshake.task_status = 1 (启动信号)
+    dc cvac payload  (×3 cachelines)       // 刷 payload 到 HBM
+    dc civac handshake                     // 刷+失效 handshake 到 HBM
+    dsb sy                                 // 等待所有 dc 操作完成 ← 阻塞 ~3-5 us
+```
+
+### 提议优化：批量 flush
+
+```
+// Step 1: 批量写入所有任务
+for each idle core with a ready task:
+    build_pto2_payload(payload, ...)
+    h->task = payload_addr
+    h->task_status = 1
+    dc cvac payload  (×3 cachelines)
+    dc civac handshake
+    // 不等待 ←── 省掉 dsb sy
+
+// Step 2: 一次性等待全部完成
+dsb sy                                     // 所有 dc 操作在这里统一完成
+```
+
+### 风险 1 (致命)：Payload 与 Handshake 的到达顺序不可控
+
+**AICPU 和 AICore 之间通过 HBM 通信，不共享缓存。** 通信协议如下：
+
+```
+AICPU 端:                              AICore 端 (轮询循环):
+                                         while (true):
+  [1] 写 payload 到 AICPU cache             dcci(handshake)        // 失效自身缓存,从HBM读
+  [2] 写 handshake.task_status = 1           if task_status == 1:   // 看到启动信号?
+  [3] dc cvac payload   → 刷到 HBM              读 payload         // 通过 handshake.task 指针读
+  [4] dc civac handshake → 刷到 HBM              execute_task(payload)
+  [5] dsb sy            → 保证[3][4]完成         task_status = 0    // 通知完成
+```
+
+**关键不变式**：AICore 看到 `task_status=1` 时，`payload` 必须已经在 HBM 中。
+
+没有 `dsb sy` 时，`dc cvac`（payload）和 `dc civac`（handshake）仅仅是**发射**了缓存操作，
+ARM 架构**不保证**它们按程序顺序完成到 HBM。可能出现：
+
+```
+时间线:
+  AICPU cache ops issued:   dc cvac(payload_A)  dc civac(hank_A)  dc cvac(payload_B) ...
+  HBM 写入实际顺序:         hank_A arrives ✓    payload_B arrives   payload_A arrives (延迟)
+                                  ↑
+                            AICore 此时 dcci 看到 task_status=1
+                            但 payload_A 还没到 HBM → 读到旧数据 → 跳转到错误地址 → HANG
+```
+
+**结论：这是一个硬件级的数据竞争 (data race)，会导致随机 hang 或数据损坏。**
+
+> ARM Architecture Reference Manual (D5.10.2): "A data cache operation is only guaranteed
+> to be complete when a DSB is executed after the cache maintenance instruction."
+
+### 风险 2 (中等)：批量延迟导致 AICore 空转时间增加
+
+当前实现中，第一个 task dispatch 后立即 `dsb sy` 完成，AICore 可能在 ~3-5 us 后就开始执行。
+批量方案中，所有 task 的 flush 要等到最后一个 task 准备好后才统一 `dsb sy`。
+如果一次循环派发 8 个 task，前面几个 task 的 AICore 要多等几个 us：
+
+```
+当前:  dispatch_A → dsb(3us) → AICore_A starts │ dispatch_B → dsb(3us) → AICore_B starts
+批量:  dispatch_A → dispatch_B → ... → dsb(3us) → AICore_A starts, AICore_B starts (同时)
+                                                    ↑ AICore_A 多等了 N×(build_payload) 时间
+```
+
+对于执行时间 ~1.78 us 的短 kernel，这个额外等待可能显著。
+
+### 风险 3 (低)：Phase 1 重入 stale 读
+
+Phase 1 用 `dc civac` 在 handshake 上做 clean+invalidate。如果批量 dispatch 改变了
+handshake 的 flush 时机，Phase 1 下一次循环读到的可能是 AICPU 自身缓存中的旧值
+而非 AICore 写回 HBM 的 `task_status=0`。当前 per-task `dsb sy` 保证了 flush 完成后
+才进入下一轮循环；批量化后这个保证变弱。
+
+### 安全的折中方案
+
+如果要优化 `dsb sy` 开销，可以考虑以下方案：
+
+#### 方案 A：两阶段 flush（保持正确性，减少 dsb 次数）
+
+```
+// Step 1: 批量发射所有 payload flush
+for each task:
+    build_pto2_payload(...)
+    h->task = payload_addr
+    // 先不写 task_status
+    dc cvac payload
+
+// Step 2: 确保所有 payload 到达 HBM
+dsb sy                                  // ← 第一个 barrier
+
+// Step 3: 现在安全地设置启动信号并 flush handshake
+for each task:
+    h->task_status = 1
+    dc civac handshake
+
+// Step 4: 确保所有 handshake 到达 HBM
+dsb sy                                  // ← 第二个 barrier
+```
+
+**2 次 `dsb sy` 替代 N 次**，同时保证 payload 一定在 handshake 之前到达 HBM。
+
+> 预期收益：N 个 task 从 N 次 dsb (~N×3us) 降到 2 次 dsb (~6us)。
+> 但需要两次遍历 core 列表，增加代码复杂度。
+
+#### 方案 B：仅合并 dsb sy，保持 dc 操作分散
+
+```
+for each task:
+    build_pto2_payload(...)
+    h->task = payload_addr
+    h->task_status = 1
+    dc cvac payload
+    dc civac handshake
+    // 不 dsb
+
+dsb sy   // 循环最后统一 barrier
+```
+
+**风险：直接触发风险 1（payload/handshake 到达顺序不可控），不安全。**
+
+### 结论
+
+| 方案 | dsb 次数 | Payload→Handshake 顺序保证 | 安全性 |
+|------|---------|---------------------------|--------|
+| 当前 | N/循环 | ✅ 每个 task 独立保证 | ✅ 安全 |
+| 方案 A (两阶段) | 2/循环 | ✅ 全局 barrier 分隔 | ✅ 安全 |
+| 方案 B (末尾单 dsb) | 1/循环 | ❌ 无保证 | ❌ 可能 hang |
+
+**推荐方案 A**。主要风险是代码复杂度增加和"前几个 task 的 AICore 需多等几 us"（风险 2），
+但不会引入正确性问题。
+
+---
+
+## 优化方案风险分析：减少 flush 范围
+
+### 当前状态：flush 了什么、没 flush 什么
+
+代码注释声称有 **3 个区域**需要 flush，但实际只 flush 了 2 个：
+
+```
+注释列出的 3 个区域:                     实际代码:
+┌─────────────────────────────────────┐  ┌──────────────────────┐
+│ ① tensor_copies[] (~2688B, ~42 CL) │  │ ❌ 没有 flush        │
+│   Thread 3 (orch) 写入 buffer.addr │  │                      │
+│   AICore 通过 args[i] → Tensor*    │  │                      │
+│   间接读取                          │  │                      │
+├─────────────────────────────────────┤  ├──────────────────────┤
+│ ② PTO2DispatchPayload (~288B, ~5CL)│  │ ✅ dc cvac × ~5      │
+│   scheduler 线程 build_pto2_payload │  │                      │
+├─────────────────────────────────────┤  ├──────────────────────┤
+│ ③ Handshake (~64B, 1 CL)           │  │ ✅ dc civac × 1      │
+│   scheduler 线程写 task_status=1    │  │                      │
+└─────────────────────────────────────┘  └──────────────────────┘
+                                         + dsb sy × 1
+```
+
+**关键发现：`tensor_copies[]` 当前没有被 flush，但 Case1 大部分情况下能通过。**
+
+### AICore 读取 tensor_copies 的完整路径
+
+```
+AICPU 端 (Thread 3 编排器):
+  pto2_submit_task():
+    task->tensor_copies[i] = *params[i].tensor;     // [W1] 拷贝 Tensor 元数据
+    task->tensor_copies[i].buffer.addr = alloc_addr; // [W2] 填入 heap 分配地址
+    task->params[i].tensor = &task->tensor_copies[i]; // 指针重定向
+
+AICPU 端 (Thread 0/1/2 调度器):
+  build_pto2_payload():
+    out->args[n] = (uint64_t)task->params[i].tensor; // [W3] 把 &tensor_copies[i] 写入 payload
+  // dc cvac payload → 刷 args[] 到 HBM (包含指向 tensor_copies 的指针值)
+  // dc civac handshake → 刷 task_status=1
+  // dsb sy
+  // ⚠️ tensor_copies[i] 本身没有 flush！
+
+AICore 端:
+  aicore_executor:
+    dcci(handshake)                                   // 从 HBM 读 handshake
+    if (task_status == 1):
+      payload = (PTO2DispatchPayload*)handshake->task  // [R1] 读 payload (已 flush ✓)
+      kernel(payload->args)                            // args 包含 Tensor* 指针
+  
+  qk_matmul kernel:
+    Tensor* qi = (Tensor*)args[0];                    // [R2] 拿到指向 tensor_copies[0] 的指针
+    bfloat16_t* addr = (bfloat16_t*)qi->buffer.addr;  // [R3] 读 tensor_copies[0].buffer.addr ⚠️
+    uint64_t offset = qi->start_offset;                // [R4] 读 tensor_copies[0].start_offset ⚠️
+    // 如果 tensor_copies 没被 flush 到 HBM，
+    // AICore dcci 读到的是 HBM 中的旧值 → buffer.addr=0 → 访问地址 0 → HANG
+```
+
+### 为什么 Case1 没有 flush tensor_copies 但能工作？
+
+**时间窗口效应**：tensor_copies 由 Thread 3（编排器）写入，由 Thread 0/1/2（调度器）dispatch。
+中间经历了多个步骤：
+
+```
+Thread 3 写 tensor_copies [W1/W2]
+    │
+    ├── STEP 2: TensorMap lookup (遍历已有 tensor，查 fanin)
+    ├── STEP 3: Heap 分配 (可能 stall 等待空间)
+    ├── STEP 4: TensorMap insert
+    ├── STEP 5: 构建 fanin 链表
+    ├── atomic store fanin_count (SEQ_CST)
+    │
+    │  ···  其他任务也在被编排、提交  ···
+    │
+    ▼
+Thread 0/1/2 发现任务就绪，dispatch [W3]
+    │
+    ├── build_pto2_payload (读 task->params[i].tensor)
+    ├── dc cvac payload
+    ├── dc civac handshake
+    └── dsb sy
+```
+
+在 [W1/W2] 和 [W3] 之间通常有 **数十到数百 us** 的间隔（依赖解析、其他任务编排等）。
+AICPU 的 L1/L2 cache 是 write-back 策略，脏 cacheline 会在以下情况被自然逐出到 HBM：
+
+1. **Cache 容量压力**：后续大量内存访问（其他 task 的 tensor_copies、TensorMap 操作等）
+   会自然逐出旧的 cacheline
+2. **L2 cache 替换策略**：LRU 或 pseudo-LRU，早期写入的 tensor_copies 会被后续访问自然逐出
+3. **AICPU 集群内部一致性**：Thread 3 的写和 Thread 0/1/2 的读在同一 AICPU 集群内，
+   集群内是 cache-coherent 的，所以 scheduler 线程通过 `task->params[i].tensor` 读到的指针值是正确的
+
+**Case1 能工作的原因**：
+- Case1 每 batch 有 `64 × 1 × (2 blocks) = 128` 组 scope，每 scope 提交 5-6 个 task
+- 总共 ~16,704 个 task，大量 tensor_copies 写入造成足够的 cache 压力
+- 从 submit 到 dispatch 的时间窗口足够长，tensor_copies 已被自然逐出到 HBM
+
+### 什么情况下 tensor_copies 未 flush 会出问题？
+
+| 风险场景 | 说明 | 可能性 |
+|---------|------|--------|
+| **短依赖链** | 任务 A 的 fanin=0（根任务），submit 后立即可 dispatch，tensor_copies 可能还在 L1 | **高** |
+| **大 Tensor 结构体** | head_dim 较大时 Tensor 使用更多 strides/repeats 字段，脏数据量更大 | 中 |
+| **低 cache 压力** | 少量任务场景（block_num 较小），cache 不够满不触发自然逐出 | **高** |
+| **跨集群调度** | 如果 Thread 3 和 Thread 0 在不同 AICPU 集群（极端配置），无集群内一致性 | 低 |
+
+**特别注意：AIV_HUB 任务是每个 scope 的第一个任务（fanin_count=0），submit 后立即就绪。
+如果 Hub 的 tensor_copies（oi, li_update, mi_update 的 buffer.addr=0）还在 cache 中
+没有到 HBM，AICore 读到的可能是旧 slot 的残留值。不过 Hub kernel 是空函数，
+它的 tensor_copies 只是被下游引用（通过 TensorMap），不被 Hub kernel 自身读取。**
+
+### 优化方案分析
+
+#### 方案 1: 完全不 flush tensor_copies（当前做法）
+
+```
+风险:   依赖 AICPU cache 自然逐出，非确定性行为
+收益:   节省 ~42 × dc cvac / dispatch = 减少 Dispatch phase ~70% 的 dc 操作
+现状:   Case1 (16704 tasks, 长依赖链) 大部分通过
+```
+
+#### 方案 2: 每次 dispatch 都 flush 全部 tensor_copies（保守方案）
+
+```
+风险:   无正确性风险
+代价:   每次 dispatch 额外 ~42 次 dc cvac，Dispatch phase 耗时可能增加 ~5-8 us/task
+        循环迭代从 ~64 us 增到 ~100+ us，Tail OH 恶化 ~50%
+```
+
+#### 方案 3: 由编排器（Thread 3）在 submit_task 末尾 flush（推荐）
+
+```cpp
+// pto_orchestrator.cpp: pto2_submit_task() 末尾
+#ifdef __aarch64__
+    // Flush tensor_copies to HBM immediately after writing.
+    // Scheduler threads on the same AICPU cluster can read via cache coherency,
+    // but AICore reads from HBM via dcci — must ensure data is in HBM.
+    uintptr_t tc0 = (uintptr_t)task->tensor_copies & ~63ULL;
+    uintptr_t tc1 = (uintptr_t)(task->tensor_copies + task->param_count);
+    for (uintptr_t a = tc0; a < tc1; a += 64) {
+        __asm__ volatile("dc cvac, %0" :: "r"(a) : "memory");
+    }
+    __asm__ volatile("dsb sy" ::: "memory");
+#endif
+```
+
+```
+优点:   ① tensor_copies 在写入后立即 flush，到 dispatch 时一定在 HBM 中
+        ② dsb sy 在编排器线程执行，不阻塞调度器线程 → 不增加 Tail OH
+        ③ 编排器的 submit_task 本身就不在关键路径上（它是流水线式提交）
+风险:   编排器吞吐量略降（每次 submit 多 ~3-5 us），
+        但编排器通常领先调度器很多（orch_drain 只占 0.0%）
+```
+
+#### 方案 4: 仅 flush 实际使用的 tensor_copies（精确方案）
+
+```cpp
+// 只 flush param_count 个 tensor，而非固定 16 个
+for (int i = 0; i < task->param_count; i++) {
+    if (task->params[i].tensor == &task->tensor_copies[i]) {
+        uintptr_t a = (uintptr_t)&task->tensor_copies[i] & ~63ULL;
+        uintptr_t end = (uintptr_t)(&task->tensor_copies[i] + 1);
+        for (; a < end; a += 64)
+            __asm__ volatile("dc cvac, %0" :: "r"(a) : "memory");
+    }
+}
+```
+
+```
+优点:   QK kernel 只有 3 个 tensor param → ~8 CL 而非 42 CL
+风险:   代码复杂度增加，需要正确跟踪哪些 param 是 tensor
+```
+
+### 总结对比
+
+| 方案 | 正确性 | Tail OH 影响 | 编排器影响 | 复杂度 |
+|------|--------|-------------|-----------|--------|
+| 1 (不 flush) | ⚠️ 依赖自然逐出，非确定性 | 无 | 无 | 最低 |
+| 2 (dispatcher 全 flush) | ✅ | 恶化 ~50% | 无 | 低 |
+| **3 (orch flush)** | **✅** | **无** | **轻微 (~3-5 us/submit)** | **低** |
+| 4 (精确 flush) | ✅ | 无或极小 | 轻微 | 中 |
+
+**推荐方案 3**：在编排器 submit_task 末尾 flush tensor_copies。
+它将 flush 成本从调度器关键路径转移到编排器的非关键路径，
+既保证正确性又不增加 Tail OH。
+
+### 附注：tensor_copies 未 flush 的典型表现
+
+当 tensor_copies 未被 flush 到 HBM 时，AICore 通过 dcci 从 HBM 读到的 Tensor.buffer.addr
+可能是旧值（0 或上一轮残留地址），导致 kernel 读取到垃圾数据或 NaN，并通过
+pipeline (QK → SOFTMAX → PV → UPDATE) 传播到最终输出。
+
+**方案 3（在编排器中 flush tensor_copies）已实现，解决了此类问题。**
diff --git a/tools/scheduler_overhead_analysis.md b/tools/scheduler_overhead_analysis.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/tools/swimlane_converter.py b/tools/swimlane_converter.py
index 906321d8a..5c7bbc576 100644
--- a/tools/swimlane_converter.py
+++ b/tools/swimlane_converter.py
@@ -102,9 +102,43 @@ def load_kernel_config(config_path):
     return func_id_to_name
 
 
-def print_task_statistics(tasks, func_id_to_name=None):
+def parse_scheduler_overhead_from_device_log(log_path, task_count):
+    """Parse device log for PTO2 scheduler stats and return scheduler loop time per task (us).
+
+    Looks for lines like: "Thread N: PTO2 scheduler stats: ... total=32522.740us"
+    Sums the 'total' values (one per scheduler thread, typically 3) and divides by task_count.
+
+    Returns:
+        float: scheduler_us_per_task, or None if parsing failed / file missing
+    """
+    import re
+    path = Path(log_path)
+    if not path.exists() or task_count <= 0:
+        return None
+    pattern = re.compile(r'total=([\d.]+)us')
+    totals = []
+    try:
+        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
+            for line in f:
+                m = pattern.search(line)
+                if m and 'PTO2 scheduler stats' in line:
+                    totals.append(float(m.group(1)))
+    except Exception:
+        return None
+    if not totals:
+        return None
+    return sum(totals) / task_count
+
+
+def print_task_statistics(tasks, func_id_to_name=None, scheduler_overhead_us_per_task=None):
     """Print task statistics grouped by func_id.
 
+    Exec = kernel execution time (end_time_us - start_time_us) on AICore.
+    Sched = AICPU view: finish_time_us - dispatch_time_us (includes head OH + Exec + tail OH).
+    High Sched with low Exec means scheduler/polling overhead (tail OH = finish_ts recorded
+    when the scheduler loop next sees the completed handshake; reordering the loop to process
+    completed tasks first reduces this).
+
     Args:
         tasks: List of task dicts
         func_id_to_name: Optional dict mapping func_id to function name
@@ -160,6 +194,7 @@ def print_task_statistics(tasks, func_id_to_name=None):
     # Print statistics
     print("\n" + "=" * 160)
     print("Task Statistics by Function")
+    print("  Exec = kernel time on AICore; Sched = AICPU dispatch->finish (incl. polling/tail OH)")
     print("=" * 160)
     print(f"{'Func_ID':<8} {'Func_Name':<12} {'Count':^6} {'Total_Exec/Sched(us)':^25} {'Avg_Exec/Sched(us)':^23} "
           f"{'Min_Exec/Sched(us)':^23} {'Max_Exec/Sched(us)':^23} {'Avg_Head/Tail_OH(us)':^23} {'Exec_%':^8}")
@@ -222,6 +257,18 @@ def print_task_statistics(tasks, func_id_to_name=None):
         total_test_time = max_finish_time - min_dispatch_time
         print(f"\nTotal Test Time: {total_test_time:.2f} us (from earliest dispatch to latest finish)")
 
+    # Task execution vs Scheduler overhead summary
+    if total_count > 0 and total_schedule_sum > 0:
+        avg_exec_us = total_duration / total_count
+        avg_sched_us = total_schedule_sum / total_count
+        exec_sched_ratio_pct = (total_duration / total_schedule_sum * 100) if total_schedule_sum > 0 else 0
+        print("\n--- Task execution vs Scheduler overhead ---")
+        print(f"  Per-task (all):  Avg Exec = {avg_exec_us:.2f} us,  Avg Sched (dispatch->finish) = {avg_sched_us:.2f} us,  Exec/Sched_ratio = {exec_sched_ratio_pct:.2f}%")
+        if scheduler_overhead_us_per_task is not None:
+            ratio_so = (scheduler_overhead_us_per_task / avg_exec_us) if avg_exec_us > 0 else 0
+            print(f"  Scheduler loop overhead (from device log): {scheduler_overhead_us_per_task:.2f} us/task  (scheduler_loop/Exec_ratio = {ratio_so:.2f})")
+        print("  (Sched = latency from dispatch to finish; scheduler loop overhead = AICPU scheduler thread CPU time per task, from device log.)")
+
     print("=" * 160)
 
 
@@ -489,6 +536,7 @@ def main():
     parser.add_argument('input', nargs='?', help='Input JSON file (.json). If not specified, uses the latest perf_swimlane_*.json file in outputs/ directory')
     parser.add_argument('-o', '--output', help='Output JSON file (default: outputs/merged_swimlane_<timestamp>.json)')
     parser.add_argument('-k', '--kernel-config', help='Path to kernel_config.py file for func_id to function name mapping')
+    parser.add_argument('--device-log', help='Path to device log file to extract scheduler loop overhead (PTO2 scheduler stats total=...us per thread)')
     parser.add_argument('-v', '--verbose', action='store_true', help='Verbose output')
 
     args = parser.parse_args()
@@ -577,8 +625,15 @@ def main():
         print(f"  Output: {output_path}")
         print(f"\nTo visualize: Open https://ui.perfetto.dev/ and drag in {output_path}")
 
-        # Print task statistics
-        print_task_statistics(data['tasks'], func_names)
+        # Optional: parse scheduler overhead from device log
+        scheduler_overhead_us = None
+        if getattr(args, 'device_log', None):
+            scheduler_overhead_us = parse_scheduler_overhead_from_device_log(args.device_log, len(data['tasks']))
+            if args.verbose and scheduler_overhead_us is not None:
+                print(f"  Parsed scheduler loop overhead from device log: {scheduler_overhead_us:.2f} us/task")
+
+        # Print task statistics (incl. task execution vs scheduler overhead)
+        print_task_statistics(data['tasks'], func_names, scheduler_overhead_us_per_task=scheduler_overhead_us)
 
         return 0
 
diff --git a/tools/tail_oh_breakdown.py b/tools/tail_oh_breakdown.py
new file mode 100644
index 000000000..1d7eced91
--- /dev/null
+++ b/tools/tail_oh_breakdown.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python3
+"""Tail OH breakdown analysis for PTO2 scheduler."""
+import json, os, re
+from collections import defaultdict
+
+# === Part 1: Per-task time breakdown from perf data ===
+perf_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'outputs')
+files = sorted([f for f in os.listdir(perf_dir) if f.startswith('perf_swimlane_')], reverse=True)
+with open(os.path.join(perf_dir, files[0])) as f:
+    data = json.load(f)
+tasks = data['tasks']
+func_names = {0:'QK', 1:'SF', 2:'PV', 3:'UP', 4:'AIC_HUB', 5:'AIV_HUB'}
+n_total = len(tasks)
+
+all_exec = sum(t['duration_us'] for t in tasks)
+all_head = sum(t['start_time_us'] - t['dispatch_time_us'] for t in tasks)
+all_tail = sum(t['finish_time_us'] - t['end_time_us'] for t in tasks)
+min_disp = min(t['dispatch_time_us'] for t in tasks)
+max_fin = max(t['finish_time_us'] for t in tasks)
+wall = max_fin - min_disp
+
+print('=' * 90)
+print('Part 1: Per-task time breakdown (from perf profiling data)')
+print('=' * 90)
+print(f'Total tasks: {n_total}')
+print(f'Wall-clock:  {wall:.1f} us')
+print()
+fmt = "  {:<35} {:>12} {:>14} {:>10}"
+print(fmt.format('Component', 'Total (us)', 'Avg/task (us)', '% of Wall'))
+print('  ' + '-' * 75)
+print(fmt.format('Kernel Exec (end-start)', f'{all_exec:.1f}', f'{all_exec/n_total:.2f}', f'{all_exec/wall*100:.1f}%'))
+print(fmt.format('Head OH (start-dispatch)', f'{all_head:.1f}', f'{all_head/n_total:.2f}', f'{all_head/wall*100:.1f}%'))
+print(fmt.format('Tail OH (finish-end)', f'{all_tail:.1f}', f'{all_tail/n_total:.2f}', f'{all_tail/wall*100:.1f}%'))
+print()
+
+# === Part 2: AICPU scheduler loop breakdown from device log ===
+log_dir = os.path.expanduser('~/ascend/log/debug/device-0')
+log_files = sorted([f for f in os.listdir(log_dir) if f.endswith('.log')], reverse=True)
+log_path = os.path.join(log_dir, log_files[0])
+
+threads = {}
+with open(log_path, 'r', errors='ignore') as f:
+    for line in f:
+        m = re.search(r'Thread (\d+): PTO2 scheduler stats: loops=(\d+), completed=(\d+), total=([\d.]+)us', line)
+        if m:
+            tid = int(m.group(1))
+            threads[tid] = {
+                'loops': int(m.group(2)),
+                'completed': int(m.group(3)),
+                'total_us': float(m.group(4))
+            }
+        m = re.search(r'Thread (\d+):   scan=([\d.]+)us \(([\d.]+)%\), orch_drain=([\d.]+)us \(([\d.]+)%\), complete=([\d.]+)us \(([\d.]+)%\), dispatch=([\d.]+)us \(([\d.]+)%\)', line)
+        if m:
+            tid = int(m.group(1))
+            threads[tid]['scan_us'] = float(m.group(2))
+            threads[tid]['scan_pct'] = float(m.group(3))
+            threads[tid]['orch_drain_us'] = float(m.group(4))
+            threads[tid]['orch_drain_pct'] = float(m.group(5))
+            threads[tid]['complete_us'] = float(m.group(6))
+            threads[tid]['complete_pct'] = float(m.group(7))
+            threads[tid]['dispatch_us'] = float(m.group(8))
+            threads[tid]['dispatch_pct'] = float(m.group(9))
+        m = re.search(r'Thread (\d+):   yield=([\d.]+)us \(([\d.]+)%, (\d+) calls', line)
+        if m:
+            tid = int(m.group(1))
+            threads[tid]['yield_us'] = float(m.group(2))
+            threads[tid]['yield_pct'] = float(m.group(3))
+            threads[tid]['yield_calls'] = int(m.group(4))
+        m = re.search(r'Thread (\d+):   lock\(ready_q\): wait=(\d+)us hold=(\d+)us \(scan=([\d]+)/([\d]+) orch=([\d]+)/([\d]+) complete=([\d]+)/([\d]+) dispatch=([\d]+)/([\d]+)\)', line)
+        if m:
+            tid = int(m.group(1))
+            threads[tid]['lock_wait_us'] = int(m.group(2))
+            threads[tid]['lock_hold_us'] = int(m.group(3))
+            threads[tid]['lock_scan_wait'] = int(m.group(4))
+            threads[tid]['lock_scan_hold'] = int(m.group(5))
+            threads[tid]['lock_complete_wait'] = int(m.group(8))
+            threads[tid]['lock_complete_hold'] = int(m.group(9))
+            threads[tid]['lock_dispatch_wait'] = int(m.group(10))
+            threads[tid]['lock_dispatch_hold'] = int(m.group(11))
+        m = re.search(r'Thread (\d+):   fanout: total_traversed=(\d+), max_len=(\d+), avg=([\d.]+)', line)
+        if m:
+            tid = int(m.group(1))
+            threads[tid]['fanout_total'] = int(m.group(2))
+            threads[tid]['fanout_max'] = int(m.group(3))
+            threads[tid]['fanout_avg'] = float(m.group(4))
+        m = re.search(r'Thread (\d+):   lock\(fanout\): spin=(\d+)us hold=(\d+)us', line)
+        if m:
+            tid = int(m.group(1))
+            threads[tid]['fanout_spin_us'] = int(m.group(2))
+            threads[tid]['fanout_hold_us'] = int(m.group(3))
+
+print('=' * 90)
+print('Part 2: AICPU scheduler loop breakdown (from device log)')
+print('  3 scheduler threads, each manages 8 AIC + 16 AIV cores')
+print('=' * 90)
+print()
+fmt2 = "  {:<10} {:>7} {:>10} {:>11}"
+print(fmt2.format('Thread', 'Loops', 'Completed', 'Total (us)'))
+print('  ' + '-' * 42)
+for tid in sorted(threads.keys()):
+    t = threads[tid]
+    print(fmt2.format('T'+str(tid), t['loops'], t['completed'], f"{t['total_us']:.1f}"))
+total_us = sum(t['total_us'] for t in threads.values())
+total_completed = sum(t['completed'] for t in threads.values())
+total_loops = sum(t['loops'] for t in threads.values())
+print(fmt2.format('SUM', total_loops, total_completed, f'{total_us:.1f}'))
+print()
+
+phases = ['scan', 'orch_drain', 'complete', 'dispatch', 'yield']
+phase_labels = {
+    'scan':       'Scan (discover new root tasks)',
+    'orch_drain': 'Orch drain (wait for orchestrator)',
+    'complete':   'Complete (poll handshake, resolve fanout)',
+    'dispatch':   'Dispatch (pop queue, build payload, flush)',
+    'yield':      'Yield (no progress, thread_yield)',
+}
+
+fmt3 = "  {:<50} {:>11} {:>10} {:>14}"
+print(fmt3.format('Phase', 'Total (us)', '% of total', 'Avg/task (us)'))
+print('  ' + '-' * 89)
+for p in phases:
+    key = p + '_us'
+    tot = sum(t.get(key, 0) for t in threads.values())
+    pct = tot / total_us * 100
+    avg = tot / total_completed if total_completed > 0 else 0
+    print(fmt3.format(phase_labels[p], f'{tot:.1f}', f'{pct:.1f}%', f'{avg:.2f}'))
+
+print()
+
+# Lock contention breakdown
+fmt4 = "  {:<50} {:>11} {:>10}"
+print(fmt4.format('Lock contention (ready_q)', 'Total (us)', '% of total'))
+print('  ' + '-' * 75)
+lock_wait = sum(t.get('lock_wait_us', 0) for t in threads.values())
+lock_hold = sum(t.get('lock_hold_us', 0) for t in threads.values())
+print(fmt4.format('  wait (spinning for lock)', str(lock_wait), f'{lock_wait/total_us*100:.1f}%'))
+print(fmt4.format('  hold (inside critical section)', str(lock_hold), f'{lock_hold/total_us*100:.1f}%'))
+print()
+
+# Lock wait breakdown by phase
+print('  Lock wait by phase:')
+for p in ['scan', 'complete', 'dispatch']:
+    w = sum(t.get(f'lock_{p}_wait', 0) for t in threads.values())
+    h = sum(t.get(f'lock_{p}_hold', 0) for t in threads.values())
+    print(f'    {p:<12}  wait={w:>6} us  hold={h:>6} us')
+print()
+
+# Fanout
+fanout_total = sum(t.get('fanout_total', 0) for t in threads.values())
+fanout_max = max(t.get('fanout_max', 0) for t in threads.values())
+fanout_spin = sum(t.get('fanout_spin_us', 0) for t in threads.values())
+fanout_hold = sum(t.get('fanout_hold_us', 0) for t in threads.values())
+print(f'  Fanout traversal: total={fanout_total}, max_len={fanout_max}, lock spin={fanout_spin}us hold={fanout_hold}us')
+
+print()
+print('=' * 90)
+print('Part 3: Tail OH distribution & cause analysis')
+print('=' * 90)
+print()
+
+tails = [t['finish_time_us'] - t['end_time_us'] for t in tasks]
+tails.sort()
+n = len(tails)
+print(f'  Tail OH distribution (N={n}):')
+for pct_val in [10, 25, 50, 75, 90, 95, 99]:
+    idx = min(int(n * pct_val / 100), n - 1)
+    print(f'    P{pct_val:<4}  {tails[idx]:>7.1f} us')
+print(f'    Max:   {tails[-1]:>7.1f} us')
+print(f'    Mean:  {sum(tails)/n:>7.1f} us')
+print()
+
+# Scheduler loop time = where Tail OH comes from
+avg_loop_us = total_us / total_loops
+complete_sum = sum(t.get('complete_us', 0) for t in threads.values())
+dispatch_sum = sum(t.get('dispatch_us', 0) for t in threads.values())
+print(f'  Avg scheduler loop iteration: {avg_loop_us:.1f} us (= min Tail OH granularity)')
+print(f'  With 3 threads sharing {total_loops} loops over {total_us/3:.0f} us wall each:')
+print()
+print(f'  Tail OH breakdown (per completed task):')
+complete_per_task = complete_sum / total_completed
+dispatch_per_task = dispatch_sum / total_completed
+scan_per_task = sum(t.get('scan_us', 0) for t in threads.values()) / total_completed
+yield_per_task = sum(t.get('yield_us', 0) for t in threads.values()) / total_completed
+print(f'    1. Dispatch phase (build payload + cache flush):  {dispatch_per_task:.2f} us/task  ({dispatch_sum/total_us*100:.1f}% of scheduler CPU)')
+print(f'       - Lock wait (ready_q pop):                     {sum(t.get("lock_dispatch_wait",0) for t in threads.values())/total_completed:.2f} us/task')
+print(f'       - Lock hold + build + dc cvac/civac + dsb sy:  {(dispatch_sum - sum(t.get("lock_dispatch_wait",0) for t in threads.values()))/total_completed:.2f} us/task')
+print(f'    2. Complete phase (poll + fanout resolve):         {complete_per_task:.2f} us/task  ({complete_sum/total_us*100:.1f}% of scheduler CPU)')
+print(f'       - Lock wait (ready_q push):                    {sum(t.get("lock_complete_wait",0) for t in threads.values())/total_completed:.2f} us/task')
+print(f'       - Fanout traversal + atomic ops:               {(complete_sum - sum(t.get("lock_complete_wait",0) for t in threads.values()))/total_completed:.2f} us/task')
+print(f'    3. Scan phase (new task discovery):               {scan_per_task:.2f} us/task')
+print(f'    4. Yield (idle):                                  {yield_per_task:.2f} us/task')
+print()
+print(f'  Key insight: Dispatch phase consumes ~62% of scheduler CPU.')
+print(f'  Within dispatch, cache flush (dc cvac + dsb sy) is the dominant cost.')
+print(f'  Each dsb sy stalls the AICPU pipeline until all prior dc ops complete.')
+print('=' * 90)

From eec24dc62995e6afffdebbf5947b4352650b96ba Mon Sep 17 00:00:00 2001
From: liaoheng <hengliao1972@163.com>
Date: Thu, 26 Feb 2026 15:08:25 +0800
Subject: [PATCH 5/6] Feature: support per-batch variable sequence length and
 chunked batching

Add support for variable sequence lengths across batches in paged attention,
controlled via PA_SEQ_LEN environment variable. Also introduces IN_CORE_BATCH
chunking for improved multi-core parallelism and configurable ready queue shards.

Key changes:
- golden.py: PA_SEQ_LEN env var for per-batch variable sequence lengths
  (e.g. PA_SEQ_LEN=33,64,17,128 for 4 different lengths)
- aiv_softmax_prepare.cpp: fix valid_len=0 bug when block is beyond a
  batch's sequence, output mij=-1e30/lij=0/pij=0 to avoid NaN from
  exp(-inf - (-inf))
- Orchestrator: IN_CORE_BATCH=16 chunking splits large batches into
  parallel chunks across multiple cores
- All kernels: accept batch_start offset for chunked processing
- aicpu_executor: configurable ready queue shards via PTO2_READY_QUEUE_SHARDS
  env var, passed through Runtime struct from host to device
---
 .../batch_paged_attention/golden.py           |  44 +++-
 .../kernels/aic/aic_pv_matmul.cpp             |   8 +-
 .../kernels/aic/aic_qk_matmul.cpp             |  11 +-
 .../kernels/aiv/aiv_online_update.cpp         |   8 +-
 .../kernels/aiv/aiv_softmax_prepare.cpp       |  40 +++-
 .../orchestration/paged_attention_orch.cpp    | 202 ++++++++++--------
 .../aicpu/aicpu_executor.cpp                  |  62 +++---
 .../host/runtime_maker.cpp                    |  17 ++
 .../runtime/runtime.cpp                       |   1 +
 .../runtime/runtime.h                         |   1 +
 10 files changed, 256 insertions(+), 138 deletions(-)

diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/golden.py b/examples/tensormap_and_ringbuffer/batch_paged_attention/golden.py
index f9f42b343..33cb08d7d 100644
--- a/examples/tensormap_and_ringbuffer/batch_paged_attention/golden.py
+++ b/examples/tensormap_and_ringbuffer/batch_paged_attention/golden.py
@@ -119,7 +119,23 @@
 
 # Select case by env var PA_CASE, default to Case1
 _selected = os.environ.get("PA_CASE", "Case1")
-PARAMS_LIST = [{"name": _selected, **ALL_CASES[_selected]}]
+_params = {"name": _selected, **ALL_CASES[_selected]}
+
+# Override context_len from env: PA_SEQ_LEN=33 (uniform) or PA_SEQ_LEN=33,64,128 (per-batch variable)
+_seq_env = os.environ.get("PA_SEQ_LEN")
+if _seq_env:
+    _seq_vals = [int(x.strip()) for x in _seq_env.split(",")]
+    if len(_seq_vals) == 1:
+        _params["context_len"] = _seq_vals[0]
+        _params["context_lens_list"] = None
+    else:
+        _params["context_len"] = max(_seq_vals)
+        _params["context_lens_list"] = _seq_vals
+    _max_seq = max(_seq_vals)
+    if _max_seq > _params["max_model_len"]:
+        _params["max_model_len"] = ((_max_seq + _params["block_size"] - 1) // _params["block_size"]) * _params["block_size"]
+
+PARAMS_LIST = [_params]
 
 
 def generate_inputs(params: dict) -> dict:
@@ -131,13 +147,27 @@ def generate_inputs(params: dict) -> dict:
     block_size = params["block_size"]
     context_len = params["context_len"]
     max_model_len = params["max_model_len"]
+    context_lens_list = params.get("context_lens_list")
 
     max_num_blocks_per_req = max_model_len // block_size
-    cur_valid_blocks = (context_len + block_size - 1) // block_size
-    total_blocks = batch * cur_valid_blocks
     scale_value = 1.0
     scale_bits = struct.unpack('I', struct.pack('f', scale_value))[0]
 
+    # Build per-batch context_lens tensor
+    if context_lens_list is not None:
+        seq_vals = context_lens_list
+        if len(seq_vals) < batch:
+            seq_vals = (seq_vals * ((batch + len(seq_vals) - 1) // len(seq_vals)))[:batch]
+        elif len(seq_vals) > batch:
+            seq_vals = seq_vals[:batch]
+        context_lens = torch.tensor(seq_vals, dtype=torch.int32)
+    else:
+        context_lens = torch.full((batch,), context_len, dtype=torch.int32)
+
+    max_ctx = int(context_lens.max().item())
+    cur_valid_blocks = (max_ctx + block_size - 1) // block_size
+    total_blocks = batch * cur_valid_blocks
+
     # Random block table: (batch, max_num_blocks_per_req) int32
     block_table = torch.randint(
         0,
@@ -146,9 +176,6 @@ def generate_inputs(params: dict) -> dict:
         dtype=torch.int32,
     )
 
-    # Context lens: all = context_len
-    context_lens = torch.full((batch,), context_len, dtype=torch.int32)
-
     config = torch.tensor(
         [batch, num_heads, kv_head_num, head_dim, block_size,
          max_num_blocks_per_req, scale_bits],
@@ -326,7 +353,10 @@ def compute_golden(tensors: dict, params: dict) -> None:
     print(f"=== Paged Attention Golden Test ({params['name']}) ===")
     print(f"batch={params['batch']}, num_heads={params['num_heads']}, head_dim={params['head_dim']}")
     print(f"kv_head_num={params['kv_head_num']}, block_size={params['block_size']}")
-    print(f"context_len={params['context_len']}")
+    if params.get('context_lens_list'):
+        print(f"context_lens (variable): {params['context_lens_list'][:8]}{'...' if len(params['context_lens_list']) > 8 else ''}")
+    else:
+        print(f"context_len={params['context_len']}")
 
     max_num_blocks = params['max_model_len'] // params['block_size']
     q_tile = min(params['num_heads'], 128)
diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp
index bea8c7305..466751ac0 100644
--- a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp
+++ b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp
@@ -28,7 +28,8 @@ static __aicore__ void pv_matmul_batch_impl(
     uint64_t block_table_ptr,
     uint64_t batch_count,
     uint64_t block_idx,
-    uint64_t block_num) {
+    uint64_t block_num,
+    uint64_t batch_start) {
 
     __gm__ half* pij_base = reinterpret_cast<__gm__ half*>(pij_batch->buffer.addr);
     __gm__ half* val_base = reinterpret_cast<__gm__ half*>(value_cache->buffer.addr);
@@ -60,7 +61,7 @@ static __aicore__ void pv_matmul_batch_impl(
 
     for (uint64_t b = 0; b < batch_count; b++) {
         __gm__ half* pij_addr = pij_base + b * M * K;
-        int32_t phys_block = bt[b * block_num + block_idx];
+        int32_t phys_block = bt[(batch_start + b) * block_num + block_idx];
         __gm__ half* vj_addr = val_base + (uint64_t)phys_block * K * N;
         __gm__ float* oi_addr = oi_base + b * M * N;
 
@@ -101,8 +102,9 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {
     uint64_t batch_count = static_cast<uint64_t>(args[4]);
     uint64_t block_idx = static_cast<uint64_t>(args[5]);
     uint64_t block_num = static_cast<uint64_t>(args[6]);
+    uint64_t batch_start = static_cast<uint64_t>(args[7]);
 
     pv_matmul_batch_impl<16, 16, 16>(
         pij_batch, value_cache, oi_new_batch,
-        block_table_ptr, batch_count, block_idx, block_num);
+        block_table_ptr, batch_count, block_idx, block_num, batch_start);
 }
diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp
index ae467d724..00451889b 100644
--- a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp
+++ b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp
@@ -30,7 +30,8 @@ static __aicore__ void qk_matmul_batch_impl(
     uint64_t block_idx,
     uint64_t q_offset,
     uint64_t block_num,
-    uint64_t num_heads) {
+    uint64_t num_heads,
+    uint64_t batch_start) {
 
     __gm__ half* query_base = reinterpret_cast<__gm__ half*>(query->buffer.addr);
     __gm__ half* key_base = reinterpret_cast<__gm__ half*>(key_cache->buffer.addr);
@@ -61,8 +62,8 @@ static __aicore__ void qk_matmul_batch_impl(
     TASSIGN(cTile, 0x0);
 
     for (uint64_t b = 0; b < batch_count; b++) {
-        __gm__ half* qi_addr = query_base + (b * num_heads + q_offset) * K;
-        int32_t phys_block = bt[b * block_num + block_idx];
+        __gm__ half* qi_addr = query_base + ((batch_start + b) * num_heads + q_offset) * K;
+        int32_t phys_block = bt[(batch_start + b) * block_num + block_idx];
         __gm__ half* kj_addr = key_base + (uint64_t)phys_block * N * K;
         __gm__ float* sij_addr = sij_base + b * M * N;
 
@@ -105,8 +106,10 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {
     uint64_t q_offset = static_cast<uint64_t>(args[6]);
     uint64_t block_num = static_cast<uint64_t>(args[7]);
     uint64_t num_heads = static_cast<uint64_t>(args[8]);
+    uint64_t batch_start = static_cast<uint64_t>(args[9]);
 
     qk_matmul_batch_impl<16, 16, 16>(
         query, key_cache, sij_batch,
-        block_table_ptr, batch_count, block_idx, q_offset, block_num, num_heads);
+        block_table_ptr, batch_count, block_idx, q_offset, block_num, num_heads,
+        batch_start);
 }
diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp
index f0c082e3c..388a73be6 100644
--- a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp
+++ b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp
@@ -38,7 +38,8 @@ static __aicore__ void online_update_batch_impl(
     uint64_t is_last,
     uint64_t batch_count,
     uint64_t q_offset,
-    uint64_t num_heads) {
+    uint64_t num_heads,
+    uint64_t batch_start) {
 
     __gm__ float* mij_base = reinterpret_cast<__gm__ float*>(mij_batch->buffer.addr);
     __gm__ float* lij_base = reinterpret_cast<__gm__ float*>(lij_batch->buffer.addr);
@@ -95,7 +96,7 @@ static __aicore__ void online_update_batch_impl(
         __gm__ float* mi_ptr = mi_base + b * M;
         __gm__ float* li_ptr = li_base + b * M;
         __gm__ float* oi_ptr = oi_base + b * M * N;
-        __gm__ float* dst_ptr = out_base + (b * num_heads + q_offset) * N;
+        __gm__ float* dst_ptr = out_base + ((batch_start + b) * num_heads + q_offset) * N;
 
         GlobalDataMxN oiNewGlobal(oi_new_ptr);
         GlobalDataMxN oiGlobal(oi_ptr);
@@ -214,9 +215,10 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {
     uint64_t batch_count = static_cast<uint64_t>(args[9]);
     uint64_t q_offset = static_cast<uint64_t>(args[10]);
     uint64_t num_heads = static_cast<uint64_t>(args[11]);
+    uint64_t batch_start = static_cast<uint64_t>(args[12]);
 
     online_update_batch_impl<16, 16>(
         mij_batch, lij_batch, oi_new_batch,
         mi_batch, li_batch, oi_batch, out,
-        is_first, is_last, batch_count, q_offset, num_heads);
+        is_first, is_last, batch_count, q_offset, num_heads, batch_start);
 }
diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
index 656271423..8e611577f 100644
--- a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
+++ b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
@@ -33,7 +33,8 @@ static __aicore__ void softmax_prepare_batch_impl(
     float scale_value,
     uint64_t context_lens_ptr,
     uint64_t batch_count,
-    uint64_t block_idx) {
+    uint64_t block_idx,
+    uint64_t batch_start) {
 
     __gm__ float* sij_base = reinterpret_cast<__gm__ float*>(sij_batch->buffer.addr);
     __gm__ half* pij_base = reinterpret_cast<__gm__ half*>(pij_batch->buffer.addr);
@@ -71,12 +72,14 @@ static __aicore__ void softmax_prepare_batch_impl(
     TASSIGN(pijF16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float));
 
     for (uint64_t b = 0; b < batch_count; b++) {
-        int32_t cur_seq = ctx_lens[b];
+        int32_t cur_seq = ctx_lens[batch_start + b];
         uint64_t start = block_idx * N;
-        uint64_t valid_len = N;
-        if (start < (uint64_t)cur_seq) {
+        uint64_t valid_len;
+        if (start >= (uint64_t)cur_seq) {
+            valid_len = 0;
+        } else {
             uint64_t remaining = (uint64_t)cur_seq - start;
-            if (remaining < (uint64_t)N) valid_len = remaining;
+            valid_len = (remaining < (uint64_t)N) ? remaining : N;
         }
 
         __gm__ float* sij_addr = sij_base + b * M * N;
@@ -89,6 +92,30 @@ static __aicore__ void softmax_prepare_batch_impl(
         GlobalScalarDN mijGlobal(mij_addr);
         GlobalScalarDN lijGlobal(lij_addr);
 
+        if (valid_len == 0) {
+            // Block entirely beyond sequence: write mij=-1e30, lij=0, pij=0
+            // Use -1e30 instead of -inf to avoid NaN in online_update (exp(-inf - (-inf)) = NaN)
+            constexpr float NEG_LARGE = -1e30f;
+            for (int i = 0; i < kAlignedRows; i++) {
+                maxTile.SetValue(i, NEG_LARGE);
+                sumTile.SetValue(i, 0.0f);
+            }
+            for (int i = 0; i < M * N; i++) {
+                pijF16Tile.SetValue(i, static_cast<half>(0.0f));
+            }
+
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(mijGlobal, maxTile);
+            TSTORE(lijGlobal, sumTile);
+            TSTORE(pijGlobal, pijF16Tile);
+
+            if (b + 1 < batch_count) {
+                pipe_barrier(PIPE_ALL);
+            }
+            continue;
+        }
+
         TLOAD(sijTile, sijGlobal);
         set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
         wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
@@ -139,8 +166,9 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {
     uint64_t context_lens_ptr = static_cast<uint64_t>(args[5]);
     uint64_t batch_count = static_cast<uint64_t>(args[6]);
     uint64_t block_idx = static_cast<uint64_t>(args[7]);
+    uint64_t batch_start = static_cast<uint64_t>(args[8]);
 
     softmax_prepare_batch_impl<16, 16>(
         sij_batch, pij_batch, mij_batch, lij_batch,
-        scale_value, context_lens_ptr, batch_count, block_idx);
+        scale_value, context_lens_ptr, batch_count, block_idx, batch_start);
 }
diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp
index 29964f767..dad4716b0 100644
--- a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp
+++ b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp
@@ -1,24 +1,30 @@
 /**
  * Batch Paged Attention Orchestration Function - 16x16 Version
  *
- * Batched architecture: the batch loop is moved inside kernels,
- * so task count is fixed at 1 + max_bn * 4 regardless of batch size.
+ * Chunked batched architecture: the full batch is split into chunks of
+ * IN_CORE_BATCH size. Each chunk's QK/SF/PV/UP tasks are independent
+ * and can be scheduled to different cores in parallel.
+ *
+ * Task count = num_chunks * (1 + max_bn * 4), where
+ *   num_chunks = ceil(batch / IN_CORE_BATCH)
+ *
+ * For batch <= IN_CORE_BATCH, behavior is identical to the non-chunked version.
  *
  * Memory Layout:
  *   Query: (batch * num_heads, head_dim) fp16
  *   Key:   (total_blocks, block_size, head_dim) fp16 (stored as K^T for QK)
  *   Value: (total_blocks, block_size, head_dim) fp16
  *
- * Intermediate batched tensors (contiguous across batch dimension):
- *   sij_batch:     (batch * q_tile, block_size)  fp32
- *   pij_batch:     (batch * q_tile, block_size)  fp16
- *   mij/lij_batch: (batch * q_tile)              fp32
- *   oi_new_batch:  (batch * q_tile, head_dim)    fp32
- *   oi_batch:      (batch * q_tile, head_dim)    fp32  accumulator
- *   mi/li_batch:   (batch * q_tile)              fp32  accumulator
+ * Per-chunk intermediate tensors (contiguous across chunk_bc dimension):
+ *   sij:     (chunk_bc * q_tile, block_size)  fp32
+ *   pij:     (chunk_bc * q_tile, block_size)  fp16
+ *   mij/lij: (chunk_bc * q_tile)              fp32
+ *   oi_new:  (chunk_bc * q_tile, head_dim)    fp32
+ *   oi:      (chunk_bc * q_tile, head_dim)    fp32  accumulator
+ *   mi/li:   (chunk_bc * q_tile)              fp32  accumulator
  *
- * Kernels receive global tensors + scalar metadata and compute per-batch
- * addresses internally, reusing L1/L0/UB tile buffers across iterations.
+ * Kernels receive global tensors + scalar metadata (including batch_start)
+ * and compute per-batch addresses internally.
  */
 
 #include <stddef.h>
@@ -106,93 +112,107 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count) {
     uint64_t bt_addr = (uint64_t)(uintptr_t)host_block_table;
     uint64_t cl_addr = (uint64_t)(uintptr_t)host_context_lens;
 
+    uint64_t IN_CORE_BATCH = 16;
+    uint64_t num_chunks = (batch + IN_CORE_BATCH - 1) / IN_CORE_BATCH;
+
     for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) {
-        PTO2_SCOPE(rt) {
-            uint64_t q_offset = q_idx * q_tile;
-
-            uint64_t oi_acc_shapes[2] = {batch * q_tile, head_dim};
-            uint64_t scalar_acc_shapes[1] = {batch * q_tile};
-            Tensor oi_batch = make_tensor(oi_acc_shapes, 2, DataType::FLOAT32);
-            Tensor li_batch = make_tensor(scalar_acc_shapes, 1, DataType::FLOAT32);
-            Tensor mi_batch = make_tensor(scalar_acc_shapes, 1, DataType::FLOAT32);
-
-            PTOParam params_hub[] = {
-                make_output_param(oi_batch),
-                make_output_param(li_batch),
-                make_output_param(mi_batch),
-            };
-            pto2_rt_submit_task(rt, FUNC_AIV_HUB, PTO2_WORKER_VECTOR, params_hub, 3);
-
-            for (uint64_t bn = 0; bn < max_bn; bn++) {
-                uint64_t sij_shapes[2] = {batch * q_tile, block_size};
-                uint64_t vec_shapes[1] = {batch * q_tile};
-                uint64_t oi_new_shapes[2] = {batch * q_tile, head_dim};
-
-                Tensor sij_b = make_tensor(sij_shapes, 2, DataType::FLOAT32);
-                Tensor pij_b = make_tensor(sij_shapes, 2, data_type);
-                Tensor mij_b = make_tensor(vec_shapes, 1, DataType::FLOAT32);
-                Tensor lij_b = make_tensor(vec_shapes, 1, DataType::FLOAT32);
-                Tensor oi_new_b = make_tensor(oi_new_shapes, 2, DataType::FLOAT32);
-
-                PTOParam params_qk[] = {
-                    make_input_param(query),
-                    make_input_param(key_cache),
-                    make_output_param(sij_b),
-                    make_scalar_param(bt_addr),
-                    make_scalar_param(batch),
-                    make_scalar_param(bn),
-                    make_scalar_param(q_offset),
-                    make_scalar_param(block_num),
-                    make_scalar_param(num_heads),
-                };
-                pto2_rt_submit_task(rt, FUNC_QK_MATMUL, PTO2_WORKER_CUBE, params_qk, 9);
-
-                PTOParam params_sf[] = {
-                    make_input_param(sij_b),
-                    make_output_param(pij_b),
-                    make_output_param(mij_b),
-                    make_output_param(lij_b),
-                    make_scalar_param(float_to_u64(scale_value)),
-                    make_scalar_param(cl_addr),
-                    make_scalar_param(batch),
-                    make_scalar_param(bn),
-                };
-                pto2_rt_submit_task(rt, FUNC_SOFTMAX_PREPARE, PTO2_WORKER_VECTOR, params_sf, 8);
-
-                PTOParam params_pv[] = {
-                    make_input_param(pij_b),
-                    make_input_param(value_cache),
-                    make_output_param(oi_new_b),
-                    make_scalar_param(bt_addr),
-                    make_scalar_param(batch),
-                    make_scalar_param(bn),
-                    make_scalar_param(block_num),
-                };
-                pto2_rt_submit_task(rt, FUNC_PV_MATMUL, PTO2_WORKER_CUBE, params_pv, 7);
-
-                uint64_t is_first = (bn == 0) ? 1 : 0;
-                uint64_t is_last = (bn == max_bn - 1) ? 1 : 0;
-                PTOParam params_up[] = {
-                    make_input_param(mij_b),
-                    make_input_param(lij_b),
-                    make_input_param(oi_new_b),
-                    make_inout_param(mi_batch),
-                    make_inout_param(li_batch),
+        uint64_t q_offset = q_idx * q_tile;
+
+        for (uint64_t batch_start = 0; batch_start < batch; batch_start += IN_CORE_BATCH) {
+            uint64_t chunk_bc = batch - batch_start;
+            if (chunk_bc > IN_CORE_BATCH) chunk_bc = IN_CORE_BATCH;
+
+            PTO2_SCOPE(rt) {
+                uint64_t oi_acc_shapes[2] = {chunk_bc * q_tile, head_dim};
+                uint64_t scalar_acc_shapes[1] = {chunk_bc * q_tile};
+                Tensor oi_batch = make_tensor(oi_acc_shapes, 2, DataType::FLOAT32);
+                Tensor li_batch = make_tensor(scalar_acc_shapes, 1, DataType::FLOAT32);
+                Tensor mi_batch = make_tensor(scalar_acc_shapes, 1, DataType::FLOAT32);
+
+                PTOParam params_hub[] = {
                     make_output_param(oi_batch),
-                    make_output_param(out),
-                    make_scalar_param(is_first),
-                    make_scalar_param(is_last),
-                    make_scalar_param(batch),
-                    make_scalar_param(q_offset),
-                    make_scalar_param(num_heads),
+                    make_output_param(li_batch),
+                    make_output_param(mi_batch),
                 };
-                pto2_rt_submit_task(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, params_up, 12);
+                pto2_rt_submit_task(rt, FUNC_AIV_HUB, PTO2_WORKER_VECTOR, params_hub, 3);
+
+                for (uint64_t bn = 0; bn < max_bn; bn++) {
+                    uint64_t sij_shapes[2] = {chunk_bc * q_tile, block_size};
+                    uint64_t vec_shapes[1] = {chunk_bc * q_tile};
+                    uint64_t oi_new_shapes[2] = {chunk_bc * q_tile, head_dim};
+
+                    Tensor sij_b = make_tensor(sij_shapes, 2, DataType::FLOAT32);
+                    Tensor pij_b = make_tensor(sij_shapes, 2, data_type);
+                    Tensor mij_b = make_tensor(vec_shapes, 1, DataType::FLOAT32);
+                    Tensor lij_b = make_tensor(vec_shapes, 1, DataType::FLOAT32);
+                    Tensor oi_new_b = make_tensor(oi_new_shapes, 2, DataType::FLOAT32);
+
+                    PTOParam params_qk[] = {
+                        make_input_param(query),
+                        make_input_param(key_cache),
+                        make_output_param(sij_b),
+                        make_scalar_param(bt_addr),
+                        make_scalar_param(chunk_bc),
+                        make_scalar_param(bn),
+                        make_scalar_param(q_offset),
+                        make_scalar_param(block_num),
+                        make_scalar_param(num_heads),
+                        make_scalar_param(batch_start),
+                    };
+                    pto2_rt_submit_task(rt, FUNC_QK_MATMUL, PTO2_WORKER_CUBE, params_qk, 10);
+
+                    PTOParam params_sf[] = {
+                        make_input_param(sij_b),
+                        make_output_param(pij_b),
+                        make_output_param(mij_b),
+                        make_output_param(lij_b),
+                        make_scalar_param(float_to_u64(scale_value)),
+                        make_scalar_param(cl_addr),
+                        make_scalar_param(chunk_bc),
+                        make_scalar_param(bn),
+                        make_scalar_param(batch_start),
+                    };
+                    pto2_rt_submit_task(rt, FUNC_SOFTMAX_PREPARE, PTO2_WORKER_VECTOR, params_sf, 9);
+
+                    PTOParam params_pv[] = {
+                        make_input_param(pij_b),
+                        make_input_param(value_cache),
+                        make_output_param(oi_new_b),
+                        make_scalar_param(bt_addr),
+                        make_scalar_param(chunk_bc),
+                        make_scalar_param(bn),
+                        make_scalar_param(block_num),
+                        make_scalar_param(batch_start),
+                    };
+                    pto2_rt_submit_task(rt, FUNC_PV_MATMUL, PTO2_WORKER_CUBE, params_pv, 8);
+
+                    uint64_t is_first = (bn == 0) ? 1 : 0;
+                    uint64_t is_last = (bn == max_bn - 1) ? 1 : 0;
+                    PTOParam params_up[] = {
+                        make_input_param(mij_b),
+                        make_input_param(lij_b),
+                        make_input_param(oi_new_b),
+                        make_inout_param(mi_batch),
+                        make_inout_param(li_batch),
+                        make_output_param(oi_batch),
+                        make_output_param(out),
+                        make_scalar_param(is_first),
+                        make_scalar_param(is_last),
+                        make_scalar_param(chunk_bc),
+                        make_scalar_param(q_offset),
+                        make_scalar_param(num_heads),
+                        make_scalar_param(batch_start),
+                    };
+                    pto2_rt_submit_task(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, params_up, 13);
+                }
             }
         }
     }
 
-    LOG_INFO(rt, "batch_paged_attention: %lu tasks (batch=%lu, max_bn=%lu)",
-             (unsigned long)(1 + max_bn * 4), (unsigned long)batch, (unsigned long)max_bn);
+    LOG_INFO(rt, "batch_paged_attention: %lu tasks (batch=%lu, max_bn=%lu, chunks=%lu, IN_CORE_BATCH=%lu)",
+             (unsigned long)(num_chunks * (1 + max_bn * 4)),
+             (unsigned long)batch, (unsigned long)max_bn,
+             (unsigned long)num_chunks, (unsigned long)IN_CORE_BATCH);
 }
 
 }  // extern "C"
diff --git a/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 94fc473ba..9b1afe46e 100644
--- a/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -61,8 +61,9 @@ constexpr int MAX_CORES_PER_THREAD = MAX_AIC_PER_THREAD + MAX_AIV_PER_THREAD;
 // Maximum tasks for ready queue (PTO2 mode uses shared memory task count)
 constexpr int AICPU_MAX_READY_TASKS = 16384;
 constexpr int AICPU_READY_MASK = AICPU_MAX_READY_TASKS - 1;
-// 3 shards per type: each scheduler thread pushes to its own shard (thread_idx % 3), pops own first + work stealing
-constexpr int PTO2_READY_QUEUE_SHARDS = 3;
+// Max shards per type: each scheduler thread pushes to its own shard (thread_idx % N), pops own first + work stealing
+// Runtime-configurable via env var PTO2_READY_QUEUE_SHARDS (1..MAX). Default=3.
+constexpr int PTO2_MAX_READY_QUEUE_SHARDS = 16;
 
 // Lightweight spinlock (avoids futex syscall overhead of std::mutex)
 struct SpinLock {
@@ -97,16 +98,18 @@ struct AicpuExecutor {
     int aic_count_{0};
     int aiv_count_{0};
 
-    // ===== 3 shards per type: push to own shard (thread_idx % 3), pop own first + work stealing =====
-    SpinLock ready_queue_aic_lock_[PTO2_READY_QUEUE_SHARDS];
-    int ready_queue_aic_[PTO2_READY_QUEUE_SHARDS][AICPU_MAX_READY_TASKS];
-    int ready_queue_aic_head_[PTO2_READY_QUEUE_SHARDS]{0};
-    int ready_queue_aic_tail_[PTO2_READY_QUEUE_SHARDS]{0};
+    // ===== N shards per type: push to own shard (thread_idx % N), pop own first + work stealing =====
+    // active_shards_ is set at runtime (1..PTO2_MAX_READY_QUEUE_SHARDS) via env PTO2_READY_QUEUE_SHARDS
+    int active_shards_{PTO2_MAX_READY_QUEUE_SHARDS};
+    SpinLock ready_queue_aic_lock_[PTO2_MAX_READY_QUEUE_SHARDS];
+    int ready_queue_aic_[PTO2_MAX_READY_QUEUE_SHARDS][AICPU_MAX_READY_TASKS];
+    int ready_queue_aic_head_[PTO2_MAX_READY_QUEUE_SHARDS]{0};
+    int ready_queue_aic_tail_[PTO2_MAX_READY_QUEUE_SHARDS]{0};
 
-    SpinLock ready_queue_aiv_lock_[PTO2_READY_QUEUE_SHARDS];
-    int ready_queue_aiv_[PTO2_READY_QUEUE_SHARDS][AICPU_MAX_READY_TASKS];
-    int ready_queue_aiv_head_[PTO2_READY_QUEUE_SHARDS]{0};
-    int ready_queue_aiv_tail_[PTO2_READY_QUEUE_SHARDS]{0};
+    SpinLock ready_queue_aiv_lock_[PTO2_MAX_READY_QUEUE_SHARDS];
+    int ready_queue_aiv_[PTO2_MAX_READY_QUEUE_SHARDS][AICPU_MAX_READY_TASKS];
+    int ready_queue_aiv_head_[PTO2_MAX_READY_QUEUE_SHARDS]{0};
+    int ready_queue_aiv_tail_[PTO2_MAX_READY_QUEUE_SHARDS]{0};
 
     // Task execution tracking
     std::atomic<int> completed_tasks_{0};
@@ -302,8 +305,19 @@ int AicpuExecutor::init(Runtime* runtime) {
     DEV_INFO("Init: orch_built_on_host=%d", orch_on_host ? 1 : 0);
     orchestrator_done_.store(orch_on_host, std::memory_order_release);
 
+    // Read ready queue shard count from Runtime (set by host via env PTO2_READY_QUEUE_SHARDS)
+    {
+        int val = runtime->ready_queue_shards;
+        if (val >= 1 && val <= PTO2_MAX_READY_QUEUE_SHARDS) {
+            active_shards_ = val;
+        } else {
+            active_shards_ = PTO2_MAX_READY_QUEUE_SHARDS;
+        }
+        DEV_ALWAYS("Ready queue shards: %d (max=%d)", active_shards_, PTO2_MAX_READY_QUEUE_SHARDS);
+    }
+
     // Initial ready tasks will be populated from PTO2 shared memory in resolve_and_dispatch_pto2
-    for (int s = 0; s < PTO2_READY_QUEUE_SHARDS; s++) {
+    for (int s = 0; s < PTO2_MAX_READY_QUEUE_SHARDS; s++) {
         ready_queue_aic_head_[s] = 0;
         ready_queue_aic_tail_[s] = 0;
         ready_queue_aiv_head_[s] = 0;
@@ -568,7 +582,7 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx,
                     if (prev + 1 == fanin_count) {
                         __atomic_store_n(&s_pto2_task_completed[consumer_slot], 1, __ATOMIC_RELEASE);
                         int32_t wt = consumer_desc->worker_type;
-                        int my_shard = thread_idx % PTO2_READY_QUEUE_SHARDS;
+                        int my_shard = thread_idx % active_shards_;
 #if PTO2_ORCH_PROFILING
                         uint64_t _l0 = get_sys_cnt_aicpu(), _l1, _l2;
 #endif
@@ -625,10 +639,10 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx,
                     int this_pop_steal = -1;
                     uint64_t _l0 = get_sys_cnt_aicpu(), _l1 = _l0, _l2 = _l0;
 #endif
-                    int my_shard = thread_idx % PTO2_READY_QUEUE_SHARDS;
+                    int my_shard = thread_idx % active_shards_;
                     if (h->core_type == CoreType::AIC) {
-                        for (int k = 0; k < PTO2_READY_QUEUE_SHARDS && task_id < 0; k++) {
-                            int shard = (my_shard + k) % PTO2_READY_QUEUE_SHARDS;
+                        for (int k = 0; k < active_shards_ && task_id < 0; k++) {
+                            int shard = (my_shard + k) % active_shards_;
                             ready_queue_aic_lock_[shard].lock();
                             if (ready_queue_aic_head_[shard] < ready_queue_aic_tail_[shard]) {
 #if PTO2_ORCH_PROFILING
@@ -645,8 +659,8 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx,
                             ready_queue_aic_lock_[shard].unlock();
                         }
                     } else {
-                        for (int k = 0; k < PTO2_READY_QUEUE_SHARDS && task_id < 0; k++) {
-                            int shard = (my_shard + k) % PTO2_READY_QUEUE_SHARDS;
+                        for (int k = 0; k < active_shards_ && task_id < 0; k++) {
+                            int shard = (my_shard + k) % active_shards_;
                             ready_queue_aiv_lock_[shard].lock();
                             if (ready_queue_aiv_head_[shard] < ready_queue_aiv_tail_[shard]) {
 #if PTO2_ORCH_PROFILING
@@ -754,7 +768,7 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx,
                     // Mark as enqueued (state=1) to prevent double-enqueue
                     __atomic_store_n(&s_pto2_task_completed[slot], 1, __ATOMIC_RELEASE);
                     int32_t wt = t->worker_type;
-                    int my_shard = thread_idx % PTO2_READY_QUEUE_SHARDS;
+                    int my_shard = thread_idx % active_shards_;
 #if PTO2_ORCH_PROFILING
                     uint64_t _l0 = get_sys_cnt_aicpu(), _l1, _l2;
 #endif
@@ -806,7 +820,7 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx,
 
                 PTO2TaskDescriptor* t = &task_descriptors[slot];
                 int32_t wt = t->worker_type;
-                int my_shard = thread_idx % PTO2_READY_QUEUE_SHARDS;
+                int my_shard = thread_idx % active_shards_;
 #if PTO2_ORCH_PROFILING
                 uint64_t _l0 = get_sys_cnt_aicpu(), _l1, _l2;
 #endif
@@ -1221,8 +1235,8 @@ int AicpuExecutor::run(Runtime* runtime) {
 }
 
 void AicpuExecutor::deinit() {
-    // Cleanup runtime execution state
-    for (int s = 0; s < PTO2_READY_QUEUE_SHARDS; s++) {
+    // Cleanup runtime execution state (clear all max slots for safety)
+    for (int s = 0; s < PTO2_MAX_READY_QUEUE_SHARDS; s++) {
         ready_queue_aic_head_[s] = 0;
         ready_queue_aic_tail_[s] = 0;
         ready_queue_aiv_head_[s] = 0;
@@ -1271,11 +1285,11 @@ void AicpuExecutor::diagnose_stuck_state(Runtime* runtime, int thread_idx,
              completed, total, total > 0 ? completed * 100.0 / total : 0.0);
 
     int aic_ready = 0, aiv_ready = 0;
-    for (int s = 0; s < PTO2_READY_QUEUE_SHARDS; s++) {
+    for (int s = 0; s < active_shards_; s++) {
         aic_ready += ready_queue_aic_tail_[s] - ready_queue_aic_head_[s];
         aiv_ready += ready_queue_aiv_tail_[s] - ready_queue_aiv_head_[s];
     }
-    DEV_ALWAYS("Ready Queues (3 shards, per-thread push + work-steal pop): AIC=%d, AIV=%d", aic_ready, aiv_ready);
+    DEV_ALWAYS("Ready Queues (%d shards, per-thread push + work-steal pop): AIC=%d, AIV=%d", active_shards_, aic_ready, aiv_ready);
 
     int busy_cores = 0;
     int idle_cores = 0;
diff --git a/src/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index 3a5493fe7..9e115501c 100644
--- a/src/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -22,6 +22,7 @@
 #include <cstddef>
 #include <cstdint>
 #include <cstdio>
+#include <cstdlib>
 #include <cstring>
 #include <iostream>
 #include <sys/time.h>
@@ -243,6 +244,22 @@ extern "C" int init_runtime_impl(Runtime *runtime,
     runtime->set_pto2_gm_sm_ptr(sm_ptr);
     runtime->record_tensor_pair(nullptr, sm_ptr, static_cast<size_t>(sm_size));
 
+    // Read ready queue shard count from environment for AICPU scheduler
+    {
+        const char* env_shards = std::getenv("PTO2_READY_QUEUE_SHARDS");
+        if (env_shards) {
+            int val = atoi(env_shards);
+            if (val >= 1 && val <= 16) {
+                runtime->ready_queue_shards = val;
+            } else {
+                std::cerr << "PTO2_READY_QUEUE_SHARDS=" << env_shards
+                          << " out of range [1,16], using default 3\n";
+                runtime->ready_queue_shards = 3;
+            }
+        }
+        std::cout << "Ready queue shards: " << runtime->ready_queue_shards << "\n";
+    }
+
     // Set up device orchestration state
     runtime->set_orch_built_on_host(false);
     runtime->set_orch_args(device_args, func_args_count);
diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp
index a3b7c5bf5..80734d6eb 100644
--- a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp
+++ b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp
@@ -20,6 +20,7 @@ Runtime::Runtime() {
     memset(workers, 0, sizeof(workers));
     worker_count = 0;
     sche_cpu_num = 1;
+    ready_queue_shards = 3;
 
     // Initialize tensor pairs
     tensor_pair_count = 0;
diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index 7c1d0a67a..5ce7bed0e 100644
--- a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -137,6 +137,7 @@ class Runtime {
 
     // Execution parameters for AICPU scheduling
     int sche_cpu_num;  // Number of AICPU threads for scheduling
+    int ready_queue_shards;  // Number of ready queue shards per core type (1..3, default 3)
 
     // PTO2 integration: kernel_id -> GM function_bin_addr mapping
     // NOTE: Made public for direct access from aicore code

From ae644e78382b10914c46bc33f951328b16e94e9c Mon Sep 17 00:00:00 2001
From: liaoheng <hengliao1972@163.com>
Date: Thu, 26 Feb 2026 17:38:23 +0800
Subject: [PATCH 6/6] Feature: ring buffer flow control and configurable task
 window

- Add last_task_alive advancement in scheduler completion handler with
  lock-free CAS to reclaim ring buffer slots and enable back-pressure
  flow control for small task windows
- Add completed_by_task tracking array to prevent stale completion state
  from recycled slots from corrupting the early-return dependency path
- Reset completed/completed_by_task in orchestrator at slot allocation
  time (safe after fanout protocol completes) so scanner CAS(0->1) works
  for root tasks at recycled slots
- Add orch_pointers_ready_ synchronization flag to ensure scheduler
  threads wait for Thread 3 to finish configuring shared memory pointers
  before entering the scheduling loop
- Support configurable ring buffer sizes via environment variables:
  PTO2_RING_TASK_WINDOW, PTO2_RING_HEAP, PTO2_RING_DEP_POOL
- Add generate_full_swimlane.py tool for Perfetto visualization with
  dedicated lanes for orchestrator, scheduler threads, and per-core
  AIC/AIV execution
---
 .../aicpu/aicpu_executor.cpp                  |  72 +++++
 .../host/runtime_maker.cpp                    |  41 +++
 .../runtime/pto_orchestrator.cpp              |  18 +-
 .../runtime/pto_orchestrator.h                |   1 +
 .../runtime/runtime.cpp                       |   3 +
 .../runtime/runtime.h                         |   5 +
 tools/generate_full_swimlane.py               | 255 ++++++++++++++++++
 7 files changed, 392 insertions(+), 3 deletions(-)
 create mode 100644 tools/generate_full_swimlane.py

diff --git a/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 9b1afe46e..d791826b7 100644
--- a/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -121,6 +121,7 @@ struct AicpuExecutor {
     std::atomic<bool> pto2_init_complete_{false};  // init block finished; others wait for this
     std::atomic<int> next_scan_index_{0};
     std::atomic<bool> sm_header_ready_{false};  // Thread 3 sets after SM header init
+    std::atomic<bool> orch_pointers_ready_{false};  // Thread 3 sets after aicpu parallel mode pointers + orch_ready_queue are configured
 
     // Orchestrator ready queue pointers (set by Thread 3, read by scheduler threads)
     volatile int32_t* orch_ready_queue_{nullptr};
@@ -154,6 +155,7 @@ static AicpuExecutor g_aicpu_executor;
 static constexpr int PTO2_MAX_SLOTS = PTO2_TASK_WINDOW_SIZE;
 static int s_pto2_fanin_refcount[PTO2_MAX_SLOTS];
 static volatile int32_t s_pto2_task_completed[PTO2_MAX_SLOTS];
+static int32_t s_pto2_completed_by_task[PTO2_MAX_SLOTS];  // task_id that set completed state (for slot-reuse validation)
 static PTO2DispatchPayload s_pto2_payload_per_core[RUNTIME_MAX_WORKER];
 
 // ===== AicpuExecutor Method Implementations =====
@@ -428,6 +430,7 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx,
         DEV_INFO("Thread %d: doing one-time init", thread_idx);
         std::memset(s_pto2_fanin_refcount, 0, sizeof(s_pto2_fanin_refcount));
         std::memset((void*)s_pto2_task_completed, 0, sizeof(s_pto2_task_completed));
+        std::memset(s_pto2_completed_by_task, -1, sizeof(s_pto2_completed_by_task));
 
         // Assign perf buffers to cores early so profiling captures all tasks
         // (total_tasks written to header later when orchestrator completes)
@@ -443,6 +446,14 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx,
         }
     }
 
+    // Wait for Thread 3 to finish setting up aicpu parallel mode pointers
+    // and orch_ready_queue before entering the scheduling loop.
+    if (thread_num_ == 4 && !runtime->get_orch_built_on_host()) {
+        while (!orch_pointers_ready_.load(std::memory_order_acquire)) {
+            std::this_thread::yield();
+        }
+    }
+
     DEV_INFO("Thread %d: PTO2 dispatch starting with %d cores", thread_idx, core_num);
     int cur_thread_completed = 0;
     int cur_thread_tasks_in_flight = 0;
@@ -547,6 +558,7 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx,
                 //     via the release/acquire pair and takes the early-return path, directly
                 //     incrementing X's fanin_refcount instead of touching fanout_head.
                 // Either way every consumer is accounted for exactly once.
+                __atomic_store_n(&s_pto2_completed_by_task[task_id & window_mask], task_id, __ATOMIC_RELEASE);
                 __atomic_store_n(&s_pto2_task_completed[task_id & window_mask], 2, __ATOMIC_RELEASE);
                 pto2_fanout_lock(pto2_task);
                 int32_t fanout_head = (int32_t)pto2_task->fanout_head;
@@ -616,6 +628,48 @@ int AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int thread_idx,
                 cur_thread_completed++;
                 made_progress = true;
                 completed_tasks_.fetch_add(1, std::memory_order_release);
+
+                // Advance last_task_alive for TaskRing flow control.
+                // Mark this task as fully consumed (state=3), then try to
+                // advance the watermark using lock-free CAS.
+                //
+                // ORDERING: Reset completed/refcount BEFORE advancing last_task_alive.
+                // Once last_task_alive advances past a slot, the orchestrator can
+                // immediately reuse it. The early-return path in
+                // pto2_add_consumer_to_producer checks aicpu_task_completed[prod_slot];
+                // if we reset AFTER the CAS, the orchestrator could see stale state=3
+                // from the old task and incorrectly skip dependency setup.
+                __atomic_store_n(&s_pto2_task_completed[task_id & window_mask], 3, __ATOMIC_RELEASE);
+                {
+                    int32_t la = __atomic_load_n(&header->last_task_alive, __ATOMIC_ACQUIRE);
+                    int32_t cti = __atomic_load_n(&header->current_task_index, __ATOMIC_ACQUIRE);
+                    while (la < cti) {
+                        int32_t la_slot = la & window_mask;
+                        if (__atomic_load_n(&s_pto2_task_completed[la_slot], __ATOMIC_ACQUIRE) < 3)
+                            break;
+                        // Only reset refcount — the orchestrator's early-return path
+                        // (pto2_add_consumer_to_producer) MUST see completed >= 2 when
+                        // the producer has actually finished, per the fanout lock protocol.
+                        // completed_by_task guards against stale state from recycled slots:
+                        // the old task's completed_by_task won't match the new producer_id.
+                        __atomic_store_n(&s_pto2_fanin_refcount[la_slot], 0, __ATOMIC_RELEASE);
+                        // Advance last_task_alive to make this slot available.
+                        int32_t expected = la;
+                        if (__atomic_compare_exchange_n(&header->last_task_alive, &expected, la + 1,
+                                false, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)) {
+                            // Advance heap_tail for HeapRing flow control
+                            PTO2TaskDescriptor* consumed_t = &task_descriptors[la_slot];
+                            if (consumed_t->packed_buffer_end != nullptr) {
+                                int32_t new_tail = (int32_t)(intptr_t)consumed_t->packed_buffer_end;
+                                __atomic_store_n(&header->heap_tail, new_tail, __ATOMIC_RELEASE);
+                            }
+                            la = la + 1;
+                        } else {
+                            break;
+                        }
+                    }
+                }
+
                 // Debug: periodic progress (thread 0 only) to find which task hangs
                 if (thread_idx == 0 && task_count > 0) {
                     int32_t c = completed_tasks_.load(std::memory_order_acquire);
@@ -1105,6 +1159,19 @@ int AicpuExecutor::run(Runtime* runtime) {
                 DEV_INFO("Thread 3: No config function, using defaults");
             }
 
+            // Apply ring buffer size overrides from Runtime (set by host env vars)
+            if (runtime->pto2_task_window_size > 0) {
+                task_window_size = runtime->pto2_task_window_size;
+            }
+            if (runtime->pto2_heap_size > 0) {
+                heap_size = runtime->pto2_heap_size;
+            }
+            if (runtime->pto2_dep_list_pool_size > 0) {
+                dep_list_pool_size = runtime->pto2_dep_list_pool_size;
+            }
+            DEV_INFO("Thread 3: Ring sizes: task_window=%d, heap=%d, dep_pool=%d",
+                     task_window_size, heap_size, dep_list_pool_size);
+
             if (expected_arg_count > 0 && arg_count < expected_arg_count) {
                 DEV_ERROR("Thread 3: arg_count %d < expected %d", arg_count, expected_arg_count);
                 dlclose(handle);
@@ -1152,6 +1219,7 @@ int AicpuExecutor::run(Runtime* runtime) {
             if (ws <= 0 || ws > PTO2_MAX_SLOTS) ws = PTO2_MAX_SLOTS;
             rt->orchestrator.aicpu_fanin_refcount = s_pto2_fanin_refcount;
             rt->orchestrator.aicpu_task_completed = s_pto2_task_completed;
+            rt->orchestrator.aicpu_completed_by_task = s_pto2_completed_by_task;
             rt->orchestrator.aicpu_window_mask = ws - 1;
 
             // Expose orchestrator ready queue to scheduler threads
@@ -1160,6 +1228,9 @@ int AicpuExecutor::run(Runtime* runtime) {
             orch_ready_head_ = &rt->orchestrator.orch_ready_head;
             orch_ready_capacity_ = PTO2OrchestratorState::ORCH_READY_QUEUE_SIZE;
 
+            // Signal scheduler threads: all pointers are ready, safe to start scheduling.
+            orch_pointers_ready_.store(true, std::memory_order_release);
+
             // Call orchestration wrapped in outer scope (matches old PTO2_ORCHESTRATION behavior)
             DEV_ALWAYS("Thread 3: Calling aicpu_orchestration_entry from SO");
             uint64_t orch_cycle_start = get_sys_cnt_aicpu();
@@ -1257,6 +1328,7 @@ void AicpuExecutor::deinit() {
     pto2_init_complete_.store(false, std::memory_order_release);
     next_scan_index_.store(0, std::memory_order_release);
     sm_header_ready_.store(false, std::memory_order_release);
+    orch_pointers_ready_.store(false, std::memory_order_release);
 
     // Reset core discovery state
     aic_count_ = 0;
diff --git a/src/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index 9e115501c..d2d70b83b 100644
--- a/src/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -260,6 +260,47 @@ extern "C" int init_runtime_impl(Runtime *runtime,
         std::cout << "Ready queue shards: " << runtime->ready_queue_shards << "\n";
     }
 
+    // Read ring buffer size overrides from environment
+    {
+        const char* env_tw = std::getenv("PTO2_RING_TASK_WINDOW");
+        if (env_tw) {
+            int val = atoi(env_tw);
+            if (val >= 4 && (val & (val - 1)) == 0) {
+                runtime->pto2_task_window_size = val;
+            } else {
+                std::cerr << "PTO2_RING_TASK_WINDOW=" << env_tw
+                          << " invalid (must be power of 2, >= 4), ignored\n";
+            }
+        }
+        const char* env_hs = std::getenv("PTO2_RING_HEAP");
+        if (env_hs) {
+            int val = atoi(env_hs);
+            if (val >= 1024) {
+                runtime->pto2_heap_size = val;
+            } else {
+                std::cerr << "PTO2_RING_HEAP=" << env_hs
+                          << " too small (min 1024), ignored\n";
+            }
+        }
+        const char* env_dp = std::getenv("PTO2_RING_DEP_POOL");
+        if (env_dp) {
+            int val = atoi(env_dp);
+            if (val >= 16) {
+                runtime->pto2_dep_list_pool_size = val;
+            } else {
+                std::cerr << "PTO2_RING_DEP_POOL=" << env_dp
+                          << " too small (min 16), ignored\n";
+            }
+        }
+        if (runtime->pto2_task_window_size || runtime->pto2_heap_size || runtime->pto2_dep_list_pool_size) {
+            std::cout << "Ring buffer overrides:"
+                      << " task_window=" << (runtime->pto2_task_window_size ? runtime->pto2_task_window_size : PTO2_TASK_WINDOW_SIZE)
+                      << " heap=" << (runtime->pto2_heap_size ? runtime->pto2_heap_size : PTO2_HEAP_SIZE)
+                      << " dep_pool=" << (runtime->pto2_dep_list_pool_size ? runtime->pto2_dep_list_pool_size : PTO2_DEP_LIST_POOL_SIZE)
+                      << "\n";
+        }
+    }
+
     // Set up device orchestration state
     runtime->set_orch_built_on_host(false);
     runtime->set_orch_args(device_args, func_args_count);
diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index 0fa867ff7..a4d5858ca 100644
--- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -193,11 +193,14 @@ void pto2_add_consumer_to_producer(
     // This synchronizes with scheduler's on_task_complete_threadsafe
     task_fanout_lock(producer);
 
-    // AICPU parallel mode: check if producer already completed before adding to fanout
+    // AICPU parallel mode: check if producer already completed before adding to fanout.
+    // Read completed FIRST (ACQUIRE) to establish happens-before with the scheduler's
+    // RELEASE stores (completed_by_task is stored before completed in program order).
+    // Then check completed_by_task to guard against stale state from recycled slots.
     if (orch->aicpu_task_completed) {
         int32_t prod_slot = producer_id & orch->aicpu_window_mask;
-        if (__atomic_load_n(&orch->aicpu_task_completed[prod_slot], __ATOMIC_ACQUIRE) >= 2) {
-            // Producer already completed, directly increment consumer's refcount
+        if (__atomic_load_n(&orch->aicpu_task_completed[prod_slot], __ATOMIC_ACQUIRE) >= 2 &&
+            __atomic_load_n(&orch->aicpu_completed_by_task[prod_slot], __ATOMIC_RELAXED) == producer_id) {
             int32_t cons_slot = consumer_id & orch->aicpu_window_mask;
             __atomic_fetch_add(&orch->aicpu_fanin_refcount[cons_slot], 1, __ATOMIC_ACQ_REL);
             task_fanout_unlock(producer);
@@ -263,6 +266,15 @@ void pto2_submit_task(PTO2OrchestratorState* orch,
 
     PTO2TaskDescriptor* task = pto2_task_ring_get(&orch->task_ring, task_id);
 
+    // Reset scheduler-side slot state for reuse.  The old task's fanout/lock
+    // protocol is fully complete by the time last_task_alive advances past it,
+    // so resetting here (after allocation) is safe.
+    if (orch->aicpu_task_completed) {
+        int32_t slot = task_id & orch->aicpu_window_mask;
+        __atomic_store_n(&orch->aicpu_task_completed[slot], 0, __ATOMIC_RELEASE);
+        __atomic_store_n(&orch->aicpu_completed_by_task[slot], -1, __ATOMIC_RELEASE);
+    }
+
     // Initialize task descriptor
     task->task_id = task_id;
     task->kernel_id = kernel_id;
diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index 288732ea2..a3a99b35f 100644
--- a/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -76,6 +76,7 @@ struct PTO2OrchestratorState {
     // === AICPU PARALLEL MODE (set by aicpu_executor, NULL when unused) ===
     int32_t* aicpu_fanin_refcount;
     volatile int32_t* aicpu_task_completed;
+    int32_t* aicpu_completed_by_task;  // task_id that set the completed state (for slot-reuse validation)
     int32_t aicpu_window_mask;
 
     // === ORCHESTRATOR READY QUEUE (early-return path → scheduler) ===
diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp
index 80734d6eb..0149c2f1a 100644
--- a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp
+++ b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp
@@ -21,6 +21,9 @@ Runtime::Runtime() {
     worker_count = 0;
     sche_cpu_num = 1;
     ready_queue_shards = 3;
+    pto2_task_window_size = 0;
+    pto2_heap_size = 0;
+    pto2_dep_list_pool_size = 0;
 
     // Initialize tensor pairs
     tensor_pair_count = 0;
diff --git a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index 5ce7bed0e..e27668543 100644
--- a/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -139,6 +139,11 @@ class Runtime {
     int sche_cpu_num;  // Number of AICPU threads for scheduling
     int ready_queue_shards;  // Number of ready queue shards per core type (1..3, default 3)
 
+    // Ring buffer size overrides (0 = use compile-time defaults)
+    int pto2_task_window_size;
+    int pto2_heap_size;
+    int pto2_dep_list_pool_size;
+
     // PTO2 integration: kernel_id -> GM function_bin_addr mapping
     // NOTE: Made public for direct access from aicore code
     uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID];
diff --git a/tools/generate_full_swimlane.py b/tools/generate_full_swimlane.py
new file mode 100644
index 000000000..39fe296f2
--- /dev/null
+++ b/tools/generate_full_swimlane.py
@@ -0,0 +1,255 @@
+#!/usr/bin/env python3
+"""
+Generate Perfetto swimlane JSON with dedicated lanes for:
+  - Orchestrator (1 lane)
+  - 3 Scheduler threads (3 lanes)
+  - Each AIV core (individual lanes)
+  - Each AIC core (individual lanes)
+
+Usage:
+    python3 tools/generate_full_swimlane.py outputs/perf_swimlane_XXXX.json
+    python3 tools/generate_full_swimlane.py outputs/perf_swimlane_XXXX.json -o outputs/full_swimlane.json
+"""
+
+import json
+import sys
+import argparse
+from pathlib import Path
+from collections import defaultdict
+
+
+FUNC_ID_TO_NAME = {0: "QK", 1: "SF", 2: "PV", 3: "UP", 4: "AIC_HUB", 5: "AIV_HUB"}
+
+PID_ORCHESTRATOR = 1
+PID_SCHEDULER = 2
+PID_AIC = 3
+PID_AIV = 4
+
+
+def assign_cores_to_threads(aic_ids, aiv_ids, num_threads=3):
+    """Reproduce the C++ assign_cores_to_threads logic."""
+    aic_sorted = sorted(aic_ids)
+    aiv_sorted = sorted(aiv_ids)
+    aic_per = len(aic_sorted) // num_threads
+    aiv_per = len(aiv_sorted) // num_threads
+
+    core_to_thread = {}
+    for t in range(num_threads):
+        for c in aic_sorted[t * aic_per:(t + 1) * aic_per]:
+            core_to_thread[c] = t
+        for c in aiv_sorted[t * aiv_per:(t + 1) * aiv_per]:
+            core_to_thread[c] = t
+    # Remainder cores go to last thread
+    for c in aic_sorted[num_threads * aic_per:]:
+        core_to_thread[c] = num_threads - 1
+    for c in aiv_sorted[num_threads * aiv_per:]:
+        core_to_thread[c] = num_threads - 1
+    return core_to_thread
+
+
+def generate_full_swimlane(tasks, output_path):
+    events = []
+
+    # Classify cores
+    aic_ids = sorted({t["core_id"] for t in tasks if t["core_type"] == "aic"})
+    aiv_ids = sorted({t["core_id"] for t in tasks if t["core_type"] == "aiv"})
+    core_to_thread = assign_cores_to_threads(aic_ids, aiv_ids)
+
+    # ── Process metadata ──
+    for pid, name in [
+        (PID_ORCHESTRATOR, "Orchestrator (AICPU Thread 3)"),
+        (PID_SCHEDULER, "Scheduler Threads (AICPU 0-2)"),
+        (PID_AIC, "AIC Cores"),
+        (PID_AIV, "AIV Cores"),
+    ]:
+        events.append({"args": {"name": name}, "cat": "__metadata",
+                        "name": "process_name", "ph": "M", "pid": pid})
+
+    # ── Thread metadata ──
+    # Orchestrator: single lane
+    events.append({"args": {"name": "Orchestrator"}, "cat": "__metadata",
+                    "name": "thread_name", "ph": "M", "pid": PID_ORCHESTRATOR, "tid": 0})
+
+    # Scheduler: 3 lanes
+    for t in range(3):
+        events.append({"args": {"name": f"Scheduler {t}"}, "cat": "__metadata",
+                        "name": "thread_name", "ph": "M", "pid": PID_SCHEDULER, "tid": t})
+
+    # AIC cores
+    for idx, cid in enumerate(aic_ids):
+        events.append({"args": {"name": f"AIC_{cid}"}, "cat": "__metadata",
+                        "name": "thread_name", "ph": "M", "pid": PID_AIC, "tid": cid})
+
+    # AIV cores
+    for idx, cid in enumerate(aiv_ids):
+        events.append({"args": {"name": f"AIV_{cid}"}, "cat": "__metadata",
+                        "name": "thread_name", "ph": "M", "pid": PID_AIV, "tid": cid})
+
+    # Sort tasks by task_id for orchestrator ordering
+    tasks_by_id = sorted(tasks, key=lambda t: t["task_id"])
+
+    # Build task map for flow events
+    task_map = {t["task_id"]: t for t in tasks}
+
+    # ── Orchestrator lane ──
+    # Estimate submission time: the orchestrator submits tasks sequentially.
+    # Approximate submit_start as slightly before the earliest of:
+    #   dispatch_time of this task or the previous task's submit_end.
+    # For the first task, use dispatch_time - small_delta.
+    orch_events = []
+    prev_submit_end = 0
+    min_dispatch = min(t.get("dispatch_time_us", 1e9) for t in tasks if t.get("dispatch_time_us", 0) > 0)
+
+    for task in tasks_by_id:
+        tid = task["task_id"]
+        func_name = FUNC_ID_TO_NAME.get(task["func_id"], f"F{task['func_id']}")
+        disp = task.get("dispatch_time_us", 0)
+
+        # Heuristic: orchestrator submit window ≈ 9.5us/task (from orch profiling avg)
+        orch_dur = 5.0  # estimated us per submit
+        if prev_submit_end == 0:
+            submit_start = min_dispatch - 50  # first task: 50us before first dispatch
+        else:
+            submit_start = prev_submit_end + 0.1
+
+        submit_end = submit_start + orch_dur
+        prev_submit_end = submit_end
+
+        events.append({
+            "name": f"{func_name}({tid})",
+            "cat": "orchestrator",
+            "ph": "X",
+            "pid": PID_ORCHESTRATOR,
+            "tid": 0,
+            "ts": submit_start,
+            "dur": orch_dur,
+            "args": {"task_id": tid, "func": func_name, "core_id": task["core_id"]}
+        })
+
+    # ── Scheduler lanes ──
+    # Group tasks by scheduler thread (heuristic: core ownership)
+    for task in tasks:
+        disp = task.get("dispatch_time_us", 0)
+        fin = task.get("finish_time_us", 0)
+        if disp <= 0 or fin <= 0:
+            continue
+
+        core_id = task["core_id"]
+        sched_tid = core_to_thread.get(core_id, 0)
+        func_name = FUNC_ID_TO_NAME.get(task["func_id"], f"F{task['func_id']}")
+        task_id = task["task_id"]
+
+        events.append({
+            "name": f"{func_name}({task_id})",
+            "cat": "scheduler",
+            "ph": "X",
+            "pid": PID_SCHEDULER,
+            "tid": sched_tid,
+            "ts": disp,
+            "dur": fin - disp,
+            "args": {
+                "task_id": task_id,
+                "core_id": core_id,
+                "dispatch_us": disp,
+                "finish_us": fin,
+                "head_oh": task["start_time_us"] - disp,
+                "exec": task["duration_us"],
+                "tail_oh": fin - task["end_time_us"],
+            }
+        })
+
+    # ── AIC / AIV core lanes ──
+    event_id = 0
+    task_to_eid = {}
+    for task in tasks:
+        core_id = task["core_id"]
+        core_type = task["core_type"]
+        pid = PID_AIC if core_type == "aic" else PID_AIV
+        func_name = FUNC_ID_TO_NAME.get(task["func_id"], f"F{task['func_id']}")
+        task_id = task["task_id"]
+        ts = task["start_time_us"]
+        dur = task["duration_us"]
+
+        events.append({
+            "name": f"{func_name}({task_id})",
+            "cat": "kernel",
+            "ph": "X",
+            "id": event_id,
+            "pid": pid,
+            "tid": core_id,
+            "ts": ts,
+            "dur": dur,
+            "args": {
+                "task_id": task_id,
+                "func_id": task["func_id"],
+                "core_id": core_id,
+                "duration_us": dur,
+            }
+        })
+        task_to_eid[task_id] = event_id
+        event_id += 1
+
+    # ── Flow events (dependencies between core lanes) ──
+    flow_id = 0
+    for task in tasks:
+        src_pid = PID_AIC if task["core_type"] == "aic" else PID_AIV
+        src_tid = task["core_id"]
+        src_ts_end = task["end_time_us"]
+
+        for succ_id in task.get("fanout", []):
+            succ = task_map.get(succ_id)
+            if not succ:
+                continue
+            dst_pid = PID_AIC if succ["core_type"] == "aic" else PID_AIV
+            dst_tid = succ["core_id"]
+            dst_ts = succ["start_time_us"]
+
+            events.append({"cat": "flow", "id": flow_id, "name": "dep",
+                           "ph": "s", "pid": src_pid, "tid": src_tid,
+                           "ts": src_ts_end - 0.01})
+            events.append({"cat": "flow", "id": flow_id, "name": "dep",
+                           "ph": "f", "pid": dst_pid, "tid": dst_tid,
+                           "ts": dst_ts, "bp": "e"})
+            flow_id += 1
+
+    with open(output_path, "w") as f:
+        json.dump({"traceEvents": events}, f, indent=2)
+
+    print(f"Swimlane written: {output_path}")
+    print(f"  Tasks: {len(tasks)}")
+    print(f"  AIC cores: {len(aic_ids)} ({aic_ids[0]}..{aic_ids[-1]})")
+    print(f"  AIV cores: {len(aiv_ids)} ({aiv_ids[0]}..{aiv_ids[-1]})")
+    print(f"  Events: {len(events)}")
+    print(f"\nOpen https://ui.perfetto.dev/ and load {output_path}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate full swimlane Perfetto JSON")
+    parser.add_argument("input", nargs="?", help="perf_swimlane_*.json file")
+    parser.add_argument("-o", "--output", help="Output path")
+    args = parser.parse_args()
+
+    if args.input is None:
+        outputs_dir = Path(__file__).parent.parent / "outputs"
+        candidates = sorted(outputs_dir.glob("perf_swimlane_*.json"), key=lambda p: p.stat().st_mtime)
+        if not candidates:
+            print("No perf_swimlane_*.json found in outputs/", file=sys.stderr)
+            return 1
+        input_path = candidates[-1]
+        print(f"Auto-selected: {input_path.name}")
+    else:
+        input_path = Path(args.input)
+
+    with open(input_path) as f:
+        data = json.load(f)
+
+    output_path = args.output or str(
+        input_path.parent / f"perfetto_full_swimlane_{input_path.stem.split('_', 2)[-1]}.json"
+    )
+
+    generate_full_swimlane(data["tasks"], output_path)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())