1 | 1 | #include "infinicore/ops/simple_gla_attention.hpp" |
2 | | - |
3 | | -#include "../../../utils.h" |
4 | 2 | #include "../../utils.hpp" |
5 | | -#include "infinicore/context/context.hpp" |
6 | | -#include <cmath> |
7 | | -#include <cstring> |
8 | | -#include <stdexcept> |
9 | | -#include <vector> |
10 | 3 |
11 | 4 | namespace infinicore::op { |
12 | 5 |
13 | | -namespace { |
14 | | - |
15 | | -// Read one element from tensor at flat index, convert to float. |
16 | | -template <typename T> |
17 | | -inline float read_float(const std::byte *ptr, size_t idx) { |
18 | | - return static_cast<float>(*reinterpret_cast<const T *>(ptr + idx * sizeof(T))); |
19 | | -} |
20 | | - |
21 | | -inline float read_float_at(const std::byte *ptr, size_t idx, DataType dtype) { |
22 | | - switch (dtype) { |
23 | | - case DataType::F32: |
24 | | - return read_float<float>(ptr, idx); |
25 | | - case DataType::F16: |
26 | | - return _f16_to_f32(*reinterpret_cast<const fp16_t *>(ptr + idx * 2)); |
27 | | - case DataType::BF16: |
28 | | - return _bf16_to_f32(*reinterpret_cast<const bf16_t *>(ptr + idx * 2)); |
29 | | - default: |
30 | | - throw std::runtime_error("simple_gla_attention: unsupported dtype (need F32, F16, or BF16)"); |
31 | | - } |
32 | | -} |
33 | | - |
34 | | -// Write one float to tensor at flat index. |
35 | | -inline void write_float_at(std::byte *ptr, size_t idx, DataType dtype, float val) { |
36 | | - switch (dtype) { |
37 | | - case DataType::F32: |
38 | | - *reinterpret_cast<float *>(ptr + idx * 4) = val; |
39 | | - break; |
40 | | - case DataType::F16: |
41 | | - *reinterpret_cast<fp16_t *>(ptr + idx * 2) = _f32_to_f16(val); |
42 | | - break; |
43 | | - case DataType::BF16: |
44 | | - *reinterpret_cast<bf16_t *>(ptr + idx * 2) = _f32_to_bf16(val); |
45 | | - break; |
46 | | - default: |
47 | | - throw std::runtime_error("simple_gla_attention: unsupported dtype (need F32, F16, or BF16)"); |
48 | | - } |
49 | | -} |
50 | | - |
51 | | -void simple_gla_attention_cpu_impl(Tensor &out, |
52 | | - const Tensor &q, |
53 | | - const Tensor &k, |
54 | | - const Tensor &v, |
55 | | - const Tensor &g_gamma, |
56 | | - float scale) { |
57 | | - const auto &q_shape = q->shape(); |
58 | | - const size_t B = q_shape[0]; |
59 | | - const size_t T = q_shape[1]; |
60 | | - const size_t H = q_shape[2]; |
61 | | - const size_t D = q_shape[3]; |
62 | | - |
63 | | - INFINICORE_ASSERT(k->shape() == q_shape && v->shape() == q_shape); |
64 | | - INFINICORE_ASSERT(g_gamma->shape().size() == 1 && g_gamma->shape()[0] == H); |
65 | | - |
66 | | - const DataType dtype = q->dtype(); |
67 | | - const std::byte *q_ptr = q->data(); |
68 | | - const std::byte *k_ptr = k->data(); |
69 | | - const std::byte *v_ptr = v->data(); |
70 | | - const std::byte *g_ptr = g_gamma->data(); |
71 | | - std::byte *out_ptr = out->data(); |
72 | | - |
73 | | - // Contiguous layout (B, T, H, D): index (b,t,h,d) = b*T*H*D + t*H*D + h*D + d |
74 | | - const size_t stride_b = T * H * D; |
75 | | - const size_t stride_t = H * D; |
76 | | - const size_t stride_h = D; |
77 | | - |
78 | | - // Gate (H,) in float |
79 | | - std::vector<float> gate(H); |
80 | | - for (size_t h = 0; h < H; ++h) { |
81 | | - gate[h] = std::exp(read_float_at(g_ptr, h, g_gamma->dtype())); |
82 | | - } |
83 | | - |
84 | | - // State S: (B, H, D, D) in float, row-major |
85 | | - std::vector<float> S(B * H * D * D, 0.f); |
86 | | - |
87 | | - for (size_t t = 0; t < T; ++t) { |
88 | | - const size_t t_offset = t * stride_t; |
89 | | - |
90 | | - // 1. S = S * gate + outer(k_t, v_t) |
91 | | - // k_t (b,h,d_k), v_t (b,h,d_v) -> kv(b,h,d_k,d_v) = k_t(b,h,d_k) * v_t(b,h,d_v) |
92 | | - for (size_t b = 0; b < B; ++b) { |
93 | | - const size_t b_offset = b * stride_b + t_offset; |
94 | | - for (size_t h = 0; h < H; ++h) { |
95 | | - const float g = gate[h]; |
96 | | - float *S_bh = S.data() + (b * H + h) * (D * D); |
97 | | - |
98 | | - // Scale S by gate |
99 | | - for (size_t i = 0; i < D * D; ++i) { |
100 | | - S_bh[i] *= g; |
101 | | - } |
102 | | - |
103 | | - // Add outer(k_t, v_t) |
104 | | - for (size_t dk = 0; dk < D; ++dk) { |
105 | | - size_t qk_idx = b_offset + h * stride_h + dk; |
106 | | - float k_val = read_float_at(k_ptr, qk_idx, dtype); |
107 | | - for (size_t dv = 0; dv < D; ++dv) { |
108 | | - size_t qv_idx = b_offset + h * stride_h + dv; |
109 | | - float v_val = read_float_at(v_ptr, qv_idx, dtype); |
110 | | - S_bh[dk * D + dv] += k_val * v_val; |
111 | | - } |
112 | | - } |
113 | | - } |
114 | | - } |
115 | | - |
116 | | - // 2. o_t = (q_t * scale) @ S -> (B, H, D) for each (b,h): o[b,h,:] = scale * (q_t[b,h,:] @ S[b,h,:,:]) |
117 | | - for (size_t b = 0; b < B; ++b) { |
118 | | - const size_t b_offset = b * stride_b + t_offset; |
119 | | - for (size_t h = 0; h < H; ++h) { |
120 | | - const float *S_bh = S.data() + (b * H + h) * (D * D); |
121 | | - for (size_t dv = 0; dv < D; ++dv) { |
122 | | - float acc = 0.f; |
123 | | - for (size_t dk = 0; dk < D; ++dk) { |
124 | | - size_t q_idx = b_offset + h * stride_h + dk; |
125 | | - float q_val = read_float_at(q_ptr, q_idx, dtype) * scale; |
126 | | - acc += q_val * S_bh[dk * D + dv]; |
127 | | - } |
128 | | - size_t out_idx = b_offset + h * stride_h + dv; |
129 | | - write_float_at(out_ptr, out_idx, dtype, acc); |
130 | | - } |
131 | | - } |
132 | | - } |
133 | | - } |
134 | | -} |
135 | | - |
136 | | -void simple_gla_attention_cpu_calculate(Tensor &out, const Tensor &q, const Tensor &k, |
137 | | - const Tensor &v, const Tensor &g_gamma, float scale) { |
138 | | - simple_gla_attention_cpu_impl(out, q, k, v, g_gamma, scale); |
139 | | -} |
140 | | - |
141 | | -static bool register_cpu = []() { |
142 | | - SimpleGlaAttention::dispatcher().registerDevice(Device::Type::CPU, &simple_gla_attention_cpu_calculate, |
143 | | - false); |
144 | | - return true; |
145 | | -}(); |
146 | | - |
147 | | -} // namespace |
148 | | - |
149 | | -common::OpDispatcher<SimpleGlaAttention::schema> &SimpleGlaAttention::dispatcher() { |
150 | | - static common::OpDispatcher<SimpleGlaAttention::schema> dispatcher_; |
151 | | - return dispatcher_; |
152 | | -} |
153 | | - |
154 | | -void SimpleGlaAttention::execute(Tensor &out, const Tensor &q, const Tensor &k, const Tensor &v, |
155 | | - const Tensor &g_gamma, float scale) { |
156 | | - INFINICORE_ASSERT_TENSORS_SAME_DEVICE(q, k, v, g_gamma); |
157 | | - infinicore::context::setDevice(q->device()); |
158 | | - auto device_type = infinicore::context::getDevice().getType(); |
159 | | - auto func = dispatcher().lookup(device_type); |
160 | | - if (func == nullptr) { |
161 | | - throw std::runtime_error("simple_gla_attention: no implementation for device type " + std::to_string(static_cast<int>(device_type))); |
162 | | - } |
163 | | - func(out, q, k, v, g_gamma, scale); |
| 6 | +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(SimpleGlaAttention); |
| 7 | + |
| 8 | +SimpleGlaAttention::SimpleGlaAttention(Tensor out, |
| 9 | + const Tensor &q, |
| 10 | + const Tensor &k, |
| 11 | + const Tensor &v, |
| 12 | + const Tensor &g_gamma, |
| 13 | + float scale) { |
| 14 | + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k, v, g_gamma); |
| 15 | + INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(), out, q, k, v, g_gamma, scale); |
| 16 | +} |
| 17 | + |
| 18 | +void SimpleGlaAttention::execute(Tensor out, |
| 19 | + const Tensor &q, |
| 20 | + const Tensor &k, |
| 21 | + const Tensor &v, |
| 22 | + const Tensor &g_gamma, |
| 23 | + float scale) { |
| 24 | + INFINICORE_GRAPH_OP_RECORD_OR_RUN(SimpleGlaAttention, out, q, k, v, g_gamma, scale); |
164 | 25 | } |
165 | 26 |
166 | 27 | Tensor simple_gla_attention(const Tensor &q, |
|
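For reference, the CPU path deleted above implemented the plain gated-linear-attention recurrence sketched below. This is a reading of the removed loop, written per batch `b` and head `h`, with `q_t`, `k_t`, `v_t` in R^D taken as column vectors; the symbols are editorial, not names from the codebase:

```latex
\begin{aligned}
\gamma_h &= \exp\!\big(g_\gamma[h]\big), \qquad S_0 = 0 \in \mathbb{R}^{D\times D},\\
S_t &= \gamma_h\, S_{t-1} + k_t\, v_t^{\top},\\
o_t &= \mathrm{scale}\cdot q_t^{\top} S_t .
\end{aligned}
```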
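Below is a minimal float32 sketch of the same recurrence for a single (batch, head) pair, which can serve as a golden reference when checking backends registered through the new graph-op dispatcher. It is an illustration only: `simple_gla_reference`, the flat `std::vector` buffers, and the contiguous `[T, D]` layout are assumptions of the sketch, not part of the infinicore API.

```cpp
// Reference sketch of the gated-linear-attention recurrence above, for one
// (batch, head) pair. Illustration only -- plain std::vector instead of
// infinicore Tensors; all names here are hypothetical, not library API.
#include <cmath>
#include <cstddef>
#include <vector>

// q, k, v: [T, D] row-major; g_gamma: per-head log-gate; returns out: [T, D].
std::vector<float> simple_gla_reference(const std::vector<float> &q,
                                        const std::vector<float> &k,
                                        const std::vector<float> &v,
                                        float g_gamma, float scale,
                                        std::size_t T, std::size_t D) {
    const float gate = std::exp(g_gamma);
    std::vector<float> S(D * D, 0.f); // state, S[dk * D + dv], starts at zero
    std::vector<float> out(T * D, 0.f);
    for (std::size_t t = 0; t < T; ++t) {
        // S = gate * S + outer(k_t, v_t)
        for (std::size_t dk = 0; dk < D; ++dk) {
            for (std::size_t dv = 0; dv < D; ++dv) {
                S[dk * D + dv] = gate * S[dk * D + dv]
                               + k[t * D + dk] * v[t * D + dv];
            }
        }
        // o_t = scale * q_t^T S
        for (std::size_t dv = 0; dv < D; ++dv) {
            float acc = 0.f;
            for (std::size_t dk = 0; dk < D; ++dk) {
                acc += scale * q[t * D + dk] * S[dk * D + dv];
            }
            out[t * D + dv] = acc;
        }
    }
    return out;
}
```

Numerically this mirrors the removed loop: the state is rescaled by the per-head gate before the rank-1 `k v^T` update, and the query scaling is applied inside the dot product, so F32 results should match the previous CPU path up to rounding.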