
Commit 947d42a

resolve comments

Signed-off-by: Ceng23333 <441651826@qq.com>
1 parent f8a3368 commit 947d42a

10 files changed

Lines changed: 264 additions & 458 deletions

include/infinicore/ops/infllmv2_attention.hpp

Lines changed: 59 additions & 138 deletions
@@ -61,59 +61,29 @@ INFINICORE_GRAPH_OP_CLASS(
 //
 // Returns:
 //   [total_q, nheads, head_dim]
-void infllmv2_varlen_(Tensor out,
-                      const Tensor &q,
-                      const Tensor &k,
-                      const Tensor &v,
-                      const Tensor &cu_seqlens_q,
-                      const Tensor &cu_seqlens_k,
-                      int max_seqlen_q,
-                      int max_seqlen_k,
-                      float scale,
-                      bool causal,
-                      int window_size_left = -1,
-                      int window_size_right = -1);
-Tensor infllmv2_varlen(const Tensor &q,
-                       const Tensor &k,
-                       const Tensor &v,
-                       const Tensor &cu_seqlens_q,
-                       const Tensor &cu_seqlens_k,
-                       int max_seqlen_q,
-                       int max_seqlen_k,
-                       float scale,
-                       bool causal,
-                       int window_size_left = -1,
-                       int window_size_right = -1);
-
-// Preferred names (attention-disambiguated). These are header-only aliases to the
-// backward-compatible `infllmv2_*` symbols to avoid adding extra exported ABI.
-inline void infllmv2_attention_varlen_(Tensor out,
-                                       const Tensor &q,
-                                       const Tensor &k,
-                                       const Tensor &v,
-                                       const Tensor &cu_seqlens_q,
-                                       const Tensor &cu_seqlens_k,
-                                       int max_seqlen_q,
-                                       int max_seqlen_k,
-                                       float scale,
-                                       bool causal,
-                                       int window_size_left = -1,
-                                       int window_size_right = -1) {
-    infllmv2_varlen_(out, q, k, v, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, scale, causal, window_size_left, window_size_right);
-}
-inline Tensor infllmv2_attention_varlen(const Tensor &q,
-                                        const Tensor &k,
-                                        const Tensor &v,
-                                        const Tensor &cu_seqlens_q,
-                                        const Tensor &cu_seqlens_k,
-                                        int max_seqlen_q,
-                                        int max_seqlen_k,
-                                        float scale,
-                                        bool causal,
-                                        int window_size_left = -1,
-                                        int window_size_right = -1) {
-    return infllmv2_varlen(q, k, v, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, scale, causal, window_size_left, window_size_right);
-}
+void infllmv2_attention_varlen_(Tensor out,
+                                const Tensor &q,
+                                const Tensor &k,
+                                const Tensor &v,
+                                const Tensor &cu_seqlens_q,
+                                const Tensor &cu_seqlens_k,
+                                int max_seqlen_q,
+                                int max_seqlen_k,
+                                float scale,
+                                bool causal,
+                                int window_size_left = -1,
+                                int window_size_right = -1);
+Tensor infllmv2_attention_varlen(const Tensor &q,
+                                 const Tensor &k,
+                                 const Tensor &v,
+                                 const Tensor &cu_seqlens_q,
+                                 const Tensor &cu_seqlens_k,
+                                 int max_seqlen_q,
+                                 int max_seqlen_k,
+                                 float scale,
+                                 bool causal,
+                                 int window_size_left = -1,
+                                 int window_size_right = -1);
 
 // Decode-time InfLLM-V2 attention with KV cache.
 //
@@ -125,104 +95,55 @@ inline Tensor infllmv2_attention_varlen(const Tensor &q,
 //
 // Returns:
 //   [batch, seqlen_q, nheads, head_dim]
-void infllmv2_kvcache_(Tensor out,
-                       const Tensor &q,
-                       const Tensor &k_cache,
-                       const Tensor &v_cache,
-                       const Tensor &cache_lens,
-                       float scale,
-                       bool causal,
-                       int window_size_left = -1,
-                       int window_size_right = -1);
-Tensor infllmv2_kvcache(const Tensor &q,
-                        const Tensor &k_cache,
-                        const Tensor &v_cache,
-                        const Tensor &cache_lens,
-                        float scale,
-                        bool causal,
-                        int window_size_left = -1,
-                        int window_size_right = -1);
+void infllmv2_attention_kvcache_(Tensor out,
+                                 const Tensor &q,
+                                 const Tensor &k_cache,
+                                 const Tensor &v_cache,
+                                 const Tensor &cache_lens,
+                                 float scale,
+                                 bool causal,
+                                 int window_size_left = -1,
+                                 int window_size_right = -1);
+Tensor infllmv2_attention_kvcache(const Tensor &q,
+                                  const Tensor &k_cache,
+                                  const Tensor &v_cache,
+                                  const Tensor &cache_lens,
+                                  float scale,
+                                  bool causal,
+                                  int window_size_left = -1,
+                                  int window_size_right = -1);
 
-inline void infllmv2_attention_kvcache_(Tensor out,
+// Decode-time InfLLM-V2 attention with KV cache, updating cache in-place.
+//
+// Shapes:
+//   q          : [batch, seqlen_q, nheads, head_dim]
+//   k_cache    : [batch, seqlen_cache, nheads_k, head_dim]  (dense cache)
+//   v_cache    : same as k_cache
+//   k_new/v_new: [batch, seqlen_new, nheads_k, head_dim]  (new KV to append at cache_lens offsets)
+//   cache_lens : [batch] (int32) current KV length per sequence BEFORE appending
+//
+// Returns:
+//   [batch, seqlen_q, nheads, head_dim]
+void infllmv2_attention_kvcache_update_(Tensor out,
                                         const Tensor &q,
                                         const Tensor &k_cache,
                                         const Tensor &v_cache,
+                                        const Tensor &k_new,
+                                        const Tensor &v_new,
                                         const Tensor &cache_lens,
                                         float scale,
                                         bool causal,
                                         int window_size_left = -1,
-                                        int window_size_right = -1) {
-    infllmv2_kvcache_(out, q, k_cache, v_cache, cache_lens, scale, causal, window_size_left, window_size_right);
-}
-inline Tensor infllmv2_attention_kvcache(const Tensor &q,
+                                        int window_size_right = -1);
+Tensor infllmv2_attention_kvcache_update(const Tensor &q,
                                          const Tensor &k_cache,
                                          const Tensor &v_cache,
+                                         const Tensor &k_new,
+                                         const Tensor &v_new,
                                          const Tensor &cache_lens,
                                          float scale,
                                          bool causal,
                                          int window_size_left = -1,
-                                         int window_size_right = -1) {
-    return infllmv2_kvcache(q, k_cache, v_cache, cache_lens, scale, causal, window_size_left, window_size_right);
-}
-
-// Decode-time InfLLM-V2 attention with KV cache, updating cache in-place.
-//
-// Shapes:
-//   q          : [batch, seqlen_q, nheads, head_dim]
-//   k_cache    : [batch, seqlen_cache, nheads_k, head_dim]  (dense cache)
-//   v_cache    : same as k_cache
-//   k_new/v_new: [batch, seqlen_new, nheads_k, head_dim]  (new KV to append at cache_lens offsets)
-//   cache_lens : [batch] (int32) current KV length per sequence BEFORE appending
-//
-// Returns:
-//   [batch, seqlen_q, nheads, head_dim]
-void infllmv2_kvcache_update_(Tensor out,
-                              const Tensor &q,
-                              const Tensor &k_cache,
-                              const Tensor &v_cache,
-                              const Tensor &k_new,
-                              const Tensor &v_new,
-                              const Tensor &cache_lens,
-                              float scale,
-                              bool causal,
-                              int window_size_left = -1,
-                              int window_size_right = -1);
-Tensor infllmv2_kvcache_update(const Tensor &q,
-                               const Tensor &k_cache,
-                               const Tensor &v_cache,
-                               const Tensor &k_new,
-                               const Tensor &v_new,
-                               const Tensor &cache_lens,
-                               float scale,
-                               bool causal,
-                               int window_size_left = -1,
-                               int window_size_right = -1);
-
-inline void infllmv2_attention_kvcache_update_(Tensor out,
-                                               const Tensor &q,
-                                               const Tensor &k_cache,
-                                               const Tensor &v_cache,
-                                               const Tensor &k_new,
-                                               const Tensor &v_new,
-                                               const Tensor &cache_lens,
-                                               float scale,
-                                               bool causal,
-                                               int window_size_left = -1,
-                                               int window_size_right = -1) {
-    infllmv2_kvcache_update_(out, q, k_cache, v_cache, k_new, v_new, cache_lens, scale, causal, window_size_left, window_size_right);
-}
-inline Tensor infllmv2_attention_kvcache_update(const Tensor &q,
-                                                const Tensor &k_cache,
-                                                const Tensor &v_cache,
-                                                const Tensor &k_new,
-                                                const Tensor &v_new,
-                                                const Tensor &cache_lens,
-                                                float scale,
-                                                bool causal,
-                                                int window_size_left = -1,
-                                                int window_size_right = -1) {
-    return infllmv2_kvcache_update(q, k_cache, v_cache, k_new, v_new, cache_lens, scale, causal, window_size_left, window_size_right);
-}
+                                         int window_size_right = -1);
 
 } // namespace infinicore::op
-

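A note on the renamed varlen entry points: they consume unpadded (packed) q/k/v plus cu_seqlens_q/cu_seqlens_k prefix sums, as the header comments above describe. A minimal NumPy sketch of that packing convention — pure illustration, it never touches the infinicore bindings:

import numpy as np

# cu_seqlens_* are int32 prefix sums of shape [batch + 1]; sequence i
# occupies rows cu_seqlens[i]:cu_seqlens[i + 1] of the packed tensor.
seqlens_q = [3, 5, 2]                      # per-sequence query lengths
cu_seqlens_q = np.zeros(len(seqlens_q) + 1, dtype=np.int32)
cu_seqlens_q[1:] = np.cumsum(seqlens_q)    # -> [0, 3, 8, 10]

nheads, head_dim = 8, 64
total_q = int(cu_seqlens_q[-1])            # 10 tokens across the whole batch
q_packed = np.random.randn(total_q, nheads, head_dim).astype(np.float16)

# max_seqlen_q is the largest individual length, not the total.
max_seqlen_q = max(seqlens_q)
for i in range(len(seqlens_q)):
    assert len(q_packed[cu_seqlens_q[i]:cu_seqlens_q[i + 1]]) == seqlens_q[i]
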
python/infinicore/__init__.py

Lines changed: 6 additions & 3 deletions
@@ -83,7 +83,10 @@
 from infinicore.ops.hypot import hypot
 from infinicore.ops.index_add import index_add
 from infinicore.ops.index_copy import index_copy
-from infinicore.ops.infllmv2_attention import infllmv2_kvcache, infllmv2_varlen
+from infinicore.ops.infllmv2_attention import (
+    infllmv2_attention_kvcache,
+    infllmv2_attention_varlen,
+)
 from infinicore.ops.inner import inner
 from infinicore.ops.kron import kron
 from infinicore.ops.kthvalue import kthvalue
@@ -195,8 +198,8 @@
     "block_diag",
     "kron",
     "bitwise_right_shift",
-    "infllmv2_varlen",
-    "infllmv2_kvcache",
+    "infllmv2_attention_varlen",
+    "infllmv2_attention_kvcache",
     "simple_gla_attention",
     "simple_gla_decode_step",
     "simple_gla_prefill",

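With the export list updated, the renamed ops resolve from the package root. A one-line smoke test (assuming InfiniCore is installed; per the wrapper module below, the import succeeds even without the native backend, and only calling the ops raises NotImplementedError):

from infinicore import infllmv2_attention_kvcache, infllmv2_attention_varlen
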
python/infinicore/ops/infllmv2_attention.py

Lines changed: 13 additions & 9 deletions
@@ -6,17 +6,21 @@
 from infinicore.lib import _infinicore
 from infinicore.tensor import Tensor
 
-_native_infllmv2_varlen = getattr(_infinicore, "infllmv2_varlen", None)
-_native_infllmv2_kvcache = getattr(_infinicore, "infllmv2_kvcache", None)
+_native_infllmv2_attention_varlen = getattr(
+    _infinicore, "infllmv2_attention_varlen", None
+)
+_native_infllmv2_attention_kvcache = getattr(
+    _infinicore, "infllmv2_attention_kvcache", None
+)
 
 _MISSING_MSG = (
-    "infllmv2_varlen / infllmv2_kvcache not found in _infinicore. "
+    "infllmv2_attention_varlen / infllmv2_attention_kvcache not found in _infinicore. "
     "Build InfiniCore with: xmake f --aten=y --infllmv2=y (auto-detect under third_party/infllmv2_cuda_impl) "
     "or --infllmv2=/abs/path/to/libinfllm_v2.so (recommended), then xmake build/install."
 )
 
 
-def infllmv2_varlen(
+def infllmv2_attention_varlen(
     q: Tensor,
     k: Tensor,
     v: Tensor,
@@ -30,10 +34,10 @@ def infllmv2_varlen(
     window_size_right: int = -1,
 ):
     """InfLLM-V2 varlen attention. q,k,v unpadded; cu_seqlens_q/k [batch+1]. Returns [total_q, nheads, head_dim]."""
-    if _native_infllmv2_varlen is None:
+    if _native_infllmv2_attention_varlen is None:
         raise NotImplementedError(_MISSING_MSG)
     return Tensor(
-        _native_infllmv2_varlen(
+        _native_infllmv2_attention_varlen(
             q._underlying,
             k._underlying,
             v._underlying,
@@ -49,7 +53,7 @@ def infllmv2_varlen(
     )
 
 
-def infllmv2_kvcache(
+def infllmv2_attention_kvcache(
     q: Tensor,
     k_cache: Tensor,
     v_cache: Tensor,
@@ -60,10 +64,10 @@ def infllmv2_kvcache(
     window_size_right: int = -1,
 ):
     """InfLLM-V2 KV-cache (decode) attention. Returns [batch, seqlen_q, nheads, head_dim]."""
-    if _native_infllmv2_kvcache is None:
+    if _native_infllmv2_attention_kvcache is None:
         raise NotImplementedError(_MISSING_MSG)
     return Tensor(
-        _native_infllmv2_kvcache(
+        _native_infllmv2_attention_kvcache(
             q._underlying,
             k_cache._underlying,
             v_cache._underlying,

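The wrapper module probes the native extension with getattr(..., None) so that importing never fails when the optional backend is absent; failure is deferred to call time. A self-contained sketch of the same pattern (the module and symbol names here are stand-ins, not the real _infinicore layout):

import math  # stands in for the optional native module

# Probe for an optional symbol; None means "backend not built".
_native_fast_op = getattr(math, "no_such_symbol", None)

_MISSING_MSG = "fast_op not available: rebuild with the optional backend enabled."


def fast_op(x):
    """Dispatch to the native symbol if present, else fail loudly at call time."""
    if _native_fast_op is None:
        raise NotImplementedError(_MISSING_MSG)
    return _native_fast_op(x)


try:
    fast_op(1.0)
except NotImplementedError as exc:
    print(exc)  # the import above succeeded; only the call reports the gap
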
python/infinicore/tensor.py

Lines changed: 0 additions & 8 deletions
@@ -80,14 +80,6 @@ def is_pinned(self):
     def copy_(self, src):
         self._underlying.copy_(src._underlying)
 
-    def write_i32(self, linear_index, value):
-        """Write one int32 element at a contiguous linear index (metadata fast path)."""
-        self._underlying.write_i32(linear_index, int(value))
-
-    def write_i64(self, linear_index, value):
-        """Write one int64 element at a contiguous linear index (metadata fast path)."""
-        self._underlying.write_i64(linear_index, int(value))
-
     def to(self, *args, **kwargs):
         return Tensor(
             self._underlying.to(*tuple(arg._underlying for arg in args), **kwargs)

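For readers tracking this removal: write_i32/write_i64 wrote one element at a flat index into a contiguous buffer. The equivalent addressing on a NumPy array, shown only to document what the deleted fast path did (this is not an infinicore API):

import numpy as np

# Linear index 4 in a contiguous 2x3 int32 buffer is row 1, col 1 in
# C (row-major) order -- the addressing the removed methods used.
meta = np.zeros((2, 3), dtype=np.int32)
meta.reshape(-1)[4] = 7
assert meta[1, 1] == 7
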
src/infinicore/context/allocators/pinnable_block_allocator.cc

Lines changed: 0 additions & 15 deletions
@@ -5,7 +5,6 @@
 #include "../../utils.hpp"
 
 #include <algorithm>
-#include <cstdlib>
 #include <infinirt.h>
 #include <stdexcept>
 
@@ -73,13 +72,6 @@ std::byte *PinnableBlockAllocator::allocate(size_t size) {
     block->frozen = pinned_mode_;
     block->in_use = true;
 
-    if (std::getenv("INFINICORE_DEBUG_ALLOC") != nullptr) {
-        infiniDevice_t dev;
-        int dev_id;
-        infinirtGetDevice(&dev, &dev_id);
-        spdlog::warn("PinnableBlockAllocator cudaMalloc request: requested={} aligned={} class={} device={} id={}",
-                     size, size, cls.block_size, static_cast<int>(dev), dev_id);
-    }
     INFINICORE_CHECK_ERROR(infinirtMalloc(&block->ptr, block->size));
 
     all_blocks_[block->ptr] = block;
@@ -105,13 +97,6 @@ std::byte *PinnableBlockAllocator::allocate(size_t size) {
     block->frozen = pinned_mode_;
     block->in_use = true;
 
-    if (std::getenv("INFINICORE_DEBUG_ALLOC") != nullptr) {
-        infiniDevice_t dev;
-        int dev_id;
-        infinirtGetDevice(&dev, &dev_id);
-        spdlog::warn("PinnableBlockAllocator cudaMalloc request (large): requested={} aligned={} device={} id={}",
-                     size, size, static_cast<int>(dev), dev_id);
-    }
     INFINICORE_CHECK_ERROR(infinirtMalloc(&block->ptr, block->size));
 
     large_blocks_.push_back(block);

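Context for the two deleted hunks: both sat in PinnableBlockAllocator::allocate between marking a block in-use and the infinirtMalloc call, one in the size-classed path and one in the large-block path. A rough Python sketch of that bookkeeping, inferred from the fields visible above (all_blocks_, large_blocks_, pinned/frozen state) — the real allocator's details are assumptions here:

class Block:
    def __init__(self, size):
        self.size = size
        self.ptr = None      # filled in by the backend malloc
        self.frozen = False  # pinned blocks survive cache resets
        self.in_use = False


class PinnableBlockAllocator:
    """Sketch of the allocate() flow around the removed debug hunks."""

    def __init__(self, pinned_mode=False, large_threshold=1 << 20):
        self.pinned_mode = pinned_mode
        self.large_threshold = large_threshold  # assumed size-class cutoff
        self.all_blocks = {}    # ptr -> Block, size-classed path
        self.large_blocks = []  # oversized allocations tracked separately

    def allocate(self, size):
        block = Block(size)
        block.frozen = self.pinned_mode
        block.in_use = True
        # ...the removed INFINICORE_DEBUG_ALLOC logging sat here...
        block.ptr = object()    # stands in for infinirtMalloc
        if size >= self.large_threshold:
            self.large_blocks.append(block)
        else:
            self.all_blocks[block.ptr] = block
        return block.ptr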