
Commit 2e598d6

doraemonmj and majin0824 authored
Refactor: migrate legacy golden.py/kernel_config.py tests to @scene_test format (#548)
- Move 10 synthetic examples to tests/st/ as SceneTestCase tests: spmd_basic, spmd_multiblock_aiv, spmd_multiblock_mix, spmd_starvation, spmd_sync_start, spmd_sync_start_aiv, spmd_sync_start_edge, spmd_sync_start_stress, mixed_example, multi_round_paged_attention
- Merge batch_paged_attention (examples + tests/st) into single example with unified runtime-dispatch kernels (typename T template parameter)
- Merge paged_attention into examples/ with sim-compatible cases
- Convert bgemm, benchmark_bgemm, paged_attention_unroll to @scene_test
- Remove all golden.py and kernel_config.py legacy files
- Delete original example dirs after migration to tests/st/

Fix: add runtime common/ to incore kernel include path

The SPMD kernel sources #include "intrinsic.h" which lives in src/{arch}/runtime/{runtime}/common/. This directory was implicitly available via the legacy run_example.py pipeline but missing from the SceneTestCase kernel compiler include dirs.

Co-authored-by: majin0824 <majin15@huawei.com>
1 parent c599350 commit 2e598d6

87 files changed

Lines changed: 1784 additions & 4381 deletions
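The "unified runtime-dispatch kernels (typename T template parameter)" change described in the commit message comes down to one pattern: kernel_entry inspects tensor shapes at runtime and forwards to a templated implementation. The full kernels are in the diffs below; the following is a minimal host-side sketch of the same pattern. The fp16/bf16 stand-in structs, the dispatch() wrapper, and the printf body (in place of the real TLOAD/TMOV/TMATMUL sequence) are illustrative assumptions so the sketch compiles with a plain C++ compiler; only the shape tests and template arguments mirror the diff.

#include <cstdint>
#include <cstdio>

// Stand-ins for the device element types; on device these are half and bfloat16_t.
struct fp16 { uint16_t bits; };
struct bf16 { uint16_t bits; };

// T = element type, M = q_tile, K = block_size, N = head_dim (as in aic_pv_matmul.cpp).
template <typename T, int M, int K, int N>
static void pv_matmul_batch_impl(uint64_t batch_count) {
    // The real kernel loops over batches and issues TLOAD/TMOV/TMATMUL here.
    std::printf("M=%d K=%d N=%d elem=%zuB batches=%llu\n",
                M, K, N, sizeof(T), static_cast<unsigned long long>(batch_count));
}

// Shape-based dispatch, mirroring kernel_entry: the tile configuration is derived
// from the shape of the incoming P_ij tensor rather than fixed at compile time.
void dispatch(uint64_t pij_rows, uint64_t pij_cols, uint64_t batch_count) {
    uint64_t q_tile_size = pij_rows / batch_count;
    uint64_t block_size = pij_cols;
    if (q_tile_size == 16 && block_size == 16) {
        pv_matmul_batch_impl<fp16, 16, 16, 16>(batch_count);    // Small (fp16)
    } else if (q_tile_size == 16) {
        pv_matmul_batch_impl<bf16, 16, 128, 128>(batch_count);  // Case1 (bf16)
    } else {
        pv_matmul_batch_impl<bf16, 64, 64, 128>(batch_count);   // Case2 (bf16)
    }
}

int main() {
    dispatch(16 * 4, 16, 4);   // selects <fp16, 16, 16, 16>
    dispatch(16 * 2, 128, 2);  // selects <bf16, 16, 128, 128>
    dispatch(64 * 3, 64, 3);   // selects <bf16, 64, 64, 128>
    return 0;
}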


examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/golden.py

Lines changed: 0 additions & 77 deletions
This file was deleted.

examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp

Lines changed: 36 additions & 17 deletions
@@ -14,10 +14,14 @@
 // Processes batch_count batches in a single kernel invocation.
 // Per-batch addresses are computed from global tensor bases + block_table lookup.
 //
-// Template: M=q_tile, K=block_size, N=head_dim
+// Supports three tile configurations via runtime dispatch:
+//   Small: (16,  16) @ ( 16,  16) -> (16,  16)  [fp16]
+//   Case1: (16, 128) @ (128, 128) -> (16, 128)  [bf16]
+//   Case2: (64,  64) @ ( 64, 128) -> (64, 128)  [bf16]
+//
+// Template: T=data_type, M=q_tile, K=block_size, N=head_dim
 
 #include <cstdint>
-// NOLINTBEGIN(clang-diagnostic-error,bugprone-reserved-identifier,bugprone-easily-swappable-parameters,modernize-use-auto)
 #include <pto/pto-inst.hpp>
 
 #include "tensor.h"
@@ -33,25 +37,25 @@ using namespace pto;
 #define __aicore__ [aicore] // NOLINT(whitespace/braces)
 #endif
 
-template <int M, int K, int N>
+template <typename T, int M, int K, int N>
 static __aicore__ void pv_matmul_batch_impl(
     __gm__ Tensor *pij_batch, __gm__ Tensor *value_cache, __gm__ Tensor *block_table_t, __gm__ Tensor *oi_new_batch,
     uint64_t batch_count, uint64_t block_idx, uint64_t block_num, uint64_t batch_start
 ) {
-    __gm__ half *pij_base = reinterpret_cast<__gm__ half *>(pij_batch->buffer.addr);
-    __gm__ half *val_base = reinterpret_cast<__gm__ half *>(value_cache->buffer.addr);
+    __gm__ T *pij_base = reinterpret_cast<__gm__ T *>(pij_batch->buffer.addr);
+    __gm__ T *val_base = reinterpret_cast<__gm__ T *>(value_cache->buffer.addr);
     __gm__ float *oi_base = reinterpret_cast<__gm__ float *>(oi_new_batch->buffer.addr);
     __gm__ int32_t *bt = reinterpret_cast<__gm__ int32_t *>(block_table_t->buffer.addr);
 
-    using GlobalA = GlobalTensor<half, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
-    using GlobalB = GlobalTensor<half, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, N, 1>>;
+    using GlobalA = GlobalTensor<T, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
+    using GlobalB = GlobalTensor<T, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, N, 1>>;
     using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<M * N, M * N, M * N, N, 1>>;
 
-    using TileMatA = Tile<TileType::Mat, half, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
-    using TileMatB = Tile<TileType::Mat, half, K, N, BLayout::ColMajor, K, N, SLayout::RowMajor, 512>;
+    using TileMatA = Tile<TileType::Mat, T, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, T, K, N, BLayout::ColMajor, K, N, SLayout::RowMajor, 512>;
 
-    using LeftTile = TileLeft<half, M, K, M, K>;
-    using RightTile = TileRight<half, K, N, K, N>;
+    using LeftTile = TileLeft<T, M, K, M, K>;
+    using RightTile = TileRight<T, K, N, K, N>;
     using AccTile = TileAcc<float, M, N, M, N>;
 
     TileMatA aMatTile;
@@ -67,9 +71,9 @@ static __aicore__ void pv_matmul_batch_impl(
     TASSIGN(cTile, 0x0);
 
     for (uint64_t b = 0; b < batch_count; b++) {
-        __gm__ half *pij_addr = pij_base + b * M * K;
+        __gm__ T *pij_addr = pij_base + b * M * K;
         int32_t phys_block = bt[(batch_start + b) * block_num + block_idx];
-        __gm__ half *vj_addr = val_base + static_cast<uint64_t>(phys_block) * K * N;
+        __gm__ T *vj_addr = val_base + static_cast<uint64_t>(phys_block) * K * N;
         __gm__ float *oi_addr = oi_base + b * M * N;
 
         GlobalA pijGlobal(pij_addr);
@@ -99,6 +103,9 @@ static __aicore__ void pv_matmul_batch_impl(
             pipe_barrier(PIPE_ALL);
         }
     }
+
+    set_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
+    wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
 }
 
 extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
@@ -111,8 +118,20 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
     uint64_t block_num = static_cast<uint64_t>(args[6]);
     uint64_t batch_start = static_cast<uint64_t>(args[7]);
 
-    pv_matmul_batch_impl<16, 16, 16>(
-        pij_batch, value_cache, block_table_t, oi_new_batch, batch_count, block_idx, block_num, batch_start
-    );
+    uint64_t q_tile_size = static_cast<uint64_t>(pij_batch->shapes[0] / batch_count);
+    uint64_t block_size = static_cast<uint64_t>(pij_batch->shapes[1]);
+
+    if (q_tile_size == 16 && block_size == 16) {
+        pv_matmul_batch_impl<half, 16, 16, 16>(
+            pij_batch, value_cache, block_table_t, oi_new_batch, batch_count, block_idx, block_num, batch_start
+        );
+    } else if (q_tile_size == 16) {
+        pv_matmul_batch_impl<bfloat16_t, 16, 128, 128>(
+            pij_batch, value_cache, block_table_t, oi_new_batch, batch_count, block_idx, block_num, batch_start
+        );
+    } else {
+        pv_matmul_batch_impl<bfloat16_t, 64, 64, 128>(
+            pij_batch, value_cache, block_table_t, oi_new_batch, batch_count, block_idx, block_num, batch_start
+        );
+    }
 }
-// NOLINTEND(clang-diagnostic-error,bugprone-reserved-identifier,bugprone-easily-swappable-parameters,modernize-use-auto)
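Both kernel implementations also gain a trailing flag pair before returning. The intrinsics below are exactly as in the diff; the comments are an interpretation of why the pair is there, not text from the commit.

// Post an event from the fixpipe (PIPE_FIX) and block the scalar pipe (PIPE_S)
// on it, so the implementation does not return while preceding fixpipe writes
// of the accumulator results are still in flight.
set_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7);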

examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp

Lines changed: 42 additions & 17 deletions
@@ -14,7 +14,12 @@
 // Processes batch_count batches in a single kernel invocation.
 // Per-batch addresses are computed from global tensor bases + block_table lookup.
 //
-// Template: M=q_tile, K=head_dim, N=block_size
+// Supports three tile configurations via runtime dispatch:
+//   Small: (16,  16) @ ( 16,  16).T -> (16,  16)  [fp16]
+//   Case1: (16, 128) @ (128, 128).T -> (16, 128)  [bf16]
+//   Case2: (64, 128) @ (128,  64).T -> (64,  64)  [bf16]
+//
+// Template: T=data_type, M=q_tile, K=head_dim, N=block_size
 
 #include <cstdint>
 #include <pto/pto-inst.hpp>
@@ -32,26 +37,26 @@ using namespace pto;
 #define __aicore__ [aicore] // NOLINT(whitespace/braces)
 #endif
 
-template <int M, int K, int N>
+template <typename T, int M, int K, int N>
 static __aicore__ void qk_matmul_batch_impl(
     __gm__ Tensor *query, __gm__ Tensor *key_cache, __gm__ Tensor *block_table_t, __gm__ Tensor *sij_batch,
     uint64_t batch_count, uint64_t block_idx, uint64_t q_offset, uint64_t block_num, uint64_t num_heads,
     uint64_t batch_start
 ) {
-    __gm__ half *query_base = reinterpret_cast<__gm__ half *>(query->buffer.addr);
-    __gm__ half *key_base = reinterpret_cast<__gm__ half *>(key_cache->buffer.addr);
+    __gm__ T *query_base = reinterpret_cast<__gm__ T *>(query->buffer.addr);
+    __gm__ T *key_base = reinterpret_cast<__gm__ T *>(key_cache->buffer.addr);
     __gm__ float *sij_base = reinterpret_cast<__gm__ float *>(sij_batch->buffer.addr);
     __gm__ int32_t *bt = reinterpret_cast<__gm__ int32_t *>(block_table_t->buffer.addr);
 
-    using GlobalA = GlobalTensor<half, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
-    using GlobalB = GlobalTensor<half, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, 1, K>, Layout::DN>;
+    using GlobalA = GlobalTensor<T, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
+    using GlobalB = GlobalTensor<T, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, 1, K>, Layout::DN>;
     using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, Stride<M * N, M * N, M * N, N, 1>>;
 
-    using TileMatA = Tile<TileType::Mat, half, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
-    using TileMatB = Tile<TileType::Mat, half, K, N, BLayout::RowMajor, K, N, SLayout::ColMajor, 512>;
+    using TileMatA = Tile<TileType::Mat, T, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, T, K, N, BLayout::RowMajor, K, N, SLayout::ColMajor, 512>;
 
-    using LeftTile = TileLeft<half, M, K, M, K>;
-    using RightTile = TileRight<half, K, N, K, N>;
+    using LeftTile = TileLeft<T, M, K, M, K>;
+    using RightTile = TileRight<T, K, N, K, N>;
     using AccTile = TileAcc<float, M, N, M, N>;
 
     TileMatA aMatTile;
@@ -67,22 +72,23 @@ static __aicore__ void qk_matmul_batch_impl(
     TASSIGN(cTile, 0x0);
 
     for (uint64_t b = 0; b < batch_count; b++) {
-        __gm__ half *qi_addr = query_base + ((batch_start + b) * num_heads + q_offset) * K;
+        __gm__ T *qi_addr = query_base + ((batch_start + b) * num_heads + q_offset) * K;
         int32_t phys_block = bt[(batch_start + b) * block_num + block_idx];
-        __gm__ half *kj_addr = key_base + static_cast<uint64_t>(phys_block) * N * K;
+        __gm__ T *kj_addr = key_base + static_cast<uint64_t>(phys_block) * N * K;
         __gm__ float *sij_addr = sij_base + b * M * N;
 
         GlobalA qiGlobal(qi_addr);
         GlobalB kjGlobal(kj_addr);
         GlobalOut sijGlobal(sij_addr);
 
         TLOAD(aMatTile, qiGlobal);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
         TLOAD(bMatTile, kjGlobal);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
 
-        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
         wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-
         TMOV(aTile, aMatTile);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
         TMOV(bTile, bMatTile);
 
         set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
@@ -99,6 +105,9 @@ static __aicore__ void qk_matmul_batch_impl(
             pipe_barrier(PIPE_ALL);
         }
     }
+
+    set_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
+    wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
 }
 
 extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
@@ -113,7 +122,23 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
     uint64_t num_heads = static_cast<uint64_t>(args[8]);
     uint64_t batch_start = static_cast<uint64_t>(args[9]);
 
-    qk_matmul_batch_impl<16, 16, 16>(
-        query, key_cache, block_table_t, sij_batch, batch_count, block_idx, q_offset, block_num, num_heads, batch_start
-    );
+    uint64_t q_tile_size = static_cast<uint64_t>(sij_batch->shapes[0] / batch_count);
+    uint64_t block_size = static_cast<uint64_t>(sij_batch->shapes[1]);
+
+    if (q_tile_size == 16 && block_size == 16) {
+        qk_matmul_batch_impl<half, 16, 16, 16>(
+            query, key_cache, block_table_t, sij_batch, batch_count, block_idx, q_offset, block_num, num_heads,
+            batch_start
+        );
+    } else if (q_tile_size == 16) {
+        qk_matmul_batch_impl<bfloat16_t, 16, 128, 128>(
+            query, key_cache, block_table_t, sij_batch, batch_count, block_idx, q_offset, block_num, num_heads,
+            batch_start
+        );
+    } else {
+        qk_matmul_batch_impl<bfloat16_t, 64, 128, 64>(
+            query, key_cache, block_table_t, sij_batch, batch_count, block_idx, q_offset, block_num, num_heads,
+            batch_start
+        );
+    }
 }
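The qk_matmul diff also splits the single MTE2-to-MTE1 flag into per-load events. The code lines below are exactly as in the new version; the comment is an interpretation of the change, not text from the commit.

// Per-load events instead of one combined flag: each TLOAD (MTE2) posts its own
// event, and each TMOV (MTE1) waits only for the tile it actually consumes, so
// moving the Q tile can begin while the K tile is still loading.
TLOAD(aMatTile, qiGlobal);
set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
TLOAD(bMatTile, kjGlobal);
set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);

wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
TMOV(aTile, aMatTile);
wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
TMOV(bTile, bMatTile);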
