Skip to content

Commit 829d293

Browse files
committed
Merge remote-tracking branch 'upstream/main' into 2025-autumn-PPPoint-t-T1-1-4
2 parents 2ae3b59 + 148b475 commit 829d293

146 files changed

Lines changed: 10947 additions & 1426 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the searchbox below for content that may be hidden.

include/infinicore/common/hash.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
#include "../tensor.hpp"
44

5+
#include <optional>
56
#include <type_traits>
67

78
namespace infinicore {
@@ -24,6 +25,15 @@ inline void hash_combine(size_t &seed, Tensor tensor) {
2425
}
2526
}
2627

28+
// Specialization for std::optional
29+
template <typename T>
30+
inline void hash_combine(size_t &seed, const std::optional<T> &opt) {
31+
hash_combine(seed, opt.has_value());
32+
if (opt) {
33+
hash_combine(seed, *opt);
34+
}
35+
}
36+
2737
// Specialization for std::string
2838
inline void hash_combine(size_t &seed, const std::string &str) {
2939
hash_combine(seed, std::hash<std::string>{}(str));

include/infinicore/context/context.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
#include "../device.hpp"
44
#include "../memory.hpp"
55

6+
#include "../graph/graph.hpp"
7+
68
#include <infiniop.h>
79
#include <infinirt.h>
810

@@ -40,6 +42,12 @@ void destroyEvent(infinirtEvent_t event);
4042
float elapsedTime(infinirtEvent_t start, infinirtEvent_t end);
4143
void streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event);
4244

45+
// Graph recording APIs
46+
bool isGraphRecording();
47+
void startGraphRecording();
48+
void addGraphOperator(std::shared_ptr<graph::GraphOperator> op);
49+
std::shared_ptr<graph::Graph> stopGraphRecording();
50+
4351
} // namespace context
4452

4553
} // namespace infinicore

include/infinicore/graph/graph.hpp

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
#pragma once
2+
3+
#include <memory>
4+
#include <vector>
5+
6+
#include "../tensor.hpp"
7+
8+
namespace infinicore::graph {
9+
// Forward declarations
10+
class GraphManager;
11+
12+
class GraphTensor : public Tensor {
13+
public:
14+
GraphTensor(const Tensor &);
15+
void resume() const;
16+
};
17+
18+
class GraphOperator {
19+
20+
public:
21+
void run() const;
22+
~GraphOperator();
23+
24+
protected:
25+
using run_schema = void (*)(void *);
26+
using cleanup_schema = void (*)(void **);
27+
void *planned_meta_;
28+
run_schema runner_;
29+
cleanup_schema deleter_;
30+
};
31+
32+
class Graph {
33+
public:
34+
Graph() = default;
35+
~Graph() = default;
36+
37+
void run() const;
38+
39+
protected:
40+
void add_operator(std::shared_ptr<GraphOperator> op);
41+
42+
std::vector<std::shared_ptr<GraphOperator>> op_list_;
43+
44+
friend class GraphManager;
45+
};
46+
} // namespace infinicore::graph
47+
48+
#define INFINICORE_GRAPH_OP_CLASS(__OP_NAME__, ...) \
49+
class __OP_NAME__ : public graph::GraphOperator { \
50+
public: \
51+
using schema = void (*)(__VA_ARGS__); \
52+
using plan_schema = void *(*)(__VA_ARGS__); \
53+
static common::OpDispatcher<plan_schema> &plan_dispatcher(); \
54+
static common::OpDispatcher<run_schema> &run_dispatcher(); \
55+
static common::OpDispatcher<cleanup_schema> &cleanup_dispatcher(); \
56+
__OP_NAME__(__VA_ARGS__); \
57+
static void execute(__VA_ARGS__); \
58+
};
59+
60+
#define INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(__OP_NAME__) \
61+
common::OpDispatcher<__OP_NAME__::plan_schema> &__OP_NAME__::plan_dispatcher() { \
62+
static common::OpDispatcher<__OP_NAME__::plan_schema> dispatcher_; \
63+
return dispatcher_; \
64+
} \
65+
common::OpDispatcher<__OP_NAME__::run_schema> &__OP_NAME__::run_dispatcher() { \
66+
static common::OpDispatcher<__OP_NAME__::run_schema> dispatcher_; \
67+
return dispatcher_; \
68+
} \
69+
common::OpDispatcher<__OP_NAME__::cleanup_schema> &__OP_NAME__::cleanup_dispatcher() { \
70+
static common::OpDispatcher<__OP_NAME__::cleanup_schema> dispatcher_; \
71+
return dispatcher_; \
72+
}
73+
74+
#define INFINICORE_GRAPH_OP_DISPATCH(__DEVICE_TYPE__, ...) \
75+
planned_meta_ = plan_dispatcher().lookup(__DEVICE_TYPE__)(__VA_ARGS__); \
76+
runner_ = run_dispatcher().lookup(__DEVICE_TYPE__); \
77+
deleter_ = cleanup_dispatcher().lookup(__DEVICE_TYPE__);
78+
79+
#define INFINICORE_GRAPH_OP_RECORD_OR_RUN(__OP_NAME__, ...) \
80+
auto op = std::make_shared<__OP_NAME__>(__VA_ARGS__); \
81+
if (context::isGraphRecording()) { \
82+
context::addGraphOperator(op); \
83+
} else { \
84+
op->run(); \
85+
}
86+
87+
#define INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(__OP_NAME__, __PLAN_F__, __RUN_F__, __CLEANUP_F__) \
88+
static bool registered = []() { \
89+
__OP_NAME__::plan_dispatcher().registerAll(__PLAN_F__, false); \
90+
__OP_NAME__::run_dispatcher().registerAll(__RUN_F__, false); \
91+
__OP_NAME__::cleanup_dispatcher().registerAll(__CLEANUP_F__, false); \
92+
return true; \
93+
}();

include/infinicore/nn/rope.hpp

Lines changed: 50 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,47 @@ class RoPE : public Module {
1717
GPT_NEOX = 1, // GPT-NeoX style RoPE algorithm (First half dimensions for sin, second half for cos)
1818
};
1919

20+
enum class ScalingType {
21+
DEFAULT = 0, // Default RoPE
22+
LONGROPE = 1 // Long-RoPE
23+
};
24+
25+
class ScalingConfig {
26+
public:
27+
virtual ~ScalingConfig() = default;
28+
ScalingType type() const { return type_; }
29+
30+
protected:
31+
ScalingType type_ = ScalingType::DEFAULT;
32+
ScalingConfig(ScalingType type) : type_(type) {}
33+
};
34+
35+
// longrope scaling
36+
class LongRopeConfig : public ScalingConfig {
37+
protected:
38+
std::vector<float> short_factor_;
39+
std::vector<float> long_factor_;
40+
size_t original_max_position_embeddings_;
41+
float factor_;
42+
43+
public:
44+
LongRopeConfig(
45+
std::vector<float> short_factor,
46+
std::vector<float> long_factor,
47+
size_t original_max_position_embeddings,
48+
float factor = 1.0f)
49+
: ScalingConfig(ScalingType::LONGROPE),
50+
short_factor_(short_factor),
51+
long_factor_(long_factor),
52+
original_max_position_embeddings_(original_max_position_embeddings),
53+
factor_(factor == 1.0f ? 1.0f : std::sqrt(1 + std::log(factor) / std::log(original_max_position_embeddings))) {}
54+
~LongRopeConfig() override = default;
55+
size_t original_max_position_embeddings() const { return original_max_position_embeddings_; }
56+
const std::vector<float> &short_factor() const { return short_factor_; }
57+
const std::vector<float> &long_factor() const { return long_factor_; }
58+
float factor() const { return factor_; }
59+
};
60+
2061
/**
2162
* @brief Construct a RoPE layer
2263
*
@@ -26,13 +67,15 @@ class RoPE : public Module {
2667
* @param algo RoPE algorithm type (default: Algo::GPT_J)
2768
* @param dtype Data type for sin/cos cache (default: DataType::F32)
2869
* @param device Device to create the cache on
70+
* @param scaling RoPE scaling type (default: nullptr)
2971
*/
3072
RoPE(size_t head_dim,
3173
size_t max_seq_len,
3274
double theta = 10000.0,
3375
Algo algo = Algo::GPT_J,
3476
const DataType &dtype = DataType::F32,
35-
const Device &device = Device());
77+
const Device &device = Device(),
78+
std::shared_ptr<ScalingConfig> scaling = nullptr);
3679

3780
/**
3881
* @brief Forward pass: apply RoPE to a tensor
@@ -88,11 +131,12 @@ class RoPE : public Module {
88131
private:
89132
void initialize_cache();
90133

91-
size_t head_dim_; // Dimension of each attention head
92-
size_t max_seq_len_; // Maximum sequence length
93-
double theta_; // Base frequency for rotary embeddings
94-
Algo algo_; // RoPE algorithm type
95-
DataType dtype_; // Data type for cache tables
134+
size_t head_dim_; // Dimension of each attention head
135+
size_t max_seq_len_; // Maximum sequence length
136+
double theta_; // Base frequency for rotary embeddings
137+
Algo algo_; // RoPE algorithm type
138+
DataType dtype_; // Data type for cache tables
139+
std::shared_ptr<ScalingConfig> scaling_; // RoPE scaling type
96140
};
97141

98142
} // namespace infinicore::nn

include/infinicore/ops.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
11
#pragma once
22

33
#include "ops/add.hpp"
4+
#include "ops/add_rms_norm.hpp"
45
#include "ops/attention.hpp"
56
#include "ops/causal_softmax.hpp"
67
#include "ops/matmul.hpp"
78
#include "ops/ones.hpp"
9+
#include "ops/paged_attention.hpp"
10+
#include "ops/paged_attention_prefill.hpp"
11+
#include "ops/paged_caching.hpp"
12+
#include "ops/random_sample.hpp"
813
#include "ops/rearrange.hpp"
914
#include "ops/rms_norm.hpp"
1015
#include "ops/rope.hpp"
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#pragma once
2+
3+
#include "../device.hpp"
4+
#include "common/op.hpp"
5+
#include <utility>
6+
7+
namespace infinicore::op {
8+
class AddRMSNorm {
9+
public:
10+
using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor, float);
11+
static void execute(Tensor y, Tensor residual_out, Tensor a, Tensor b, Tensor weight, float epsilon = 1e-5f);
12+
static common::OpDispatcher<schema> &dispatcher();
13+
};
14+
15+
// Fused Add and RMS Normalization
16+
// Returns: (normalized_result, add_result)
17+
// The add_result can be used as residual for subsequent layers
18+
std::pair<Tensor, Tensor> add_rms_norm(Tensor a, Tensor b, Tensor weight, float epsilon = 1e-5f);
19+
void add_rms_norm_(Tensor y, Tensor residual_out, Tensor a, Tensor b, Tensor weight, float epsilon = 1e-5f);
20+
} // namespace infinicore::op

include/infinicore/ops/gemm.hpp

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,12 @@
11
#pragma once
22

33
#include "../device.hpp"
4+
#include "../graph/graph.hpp"
45
#include "common/op.hpp"
56

67
namespace infinicore::op {
78

8-
class Gemm {
9-
public:
10-
using schema = void (*)(Tensor, Tensor, Tensor, float, float);
11-
static void execute(Tensor c, Tensor a, Tensor b, float alpha, float beta);
12-
static common::OpDispatcher<schema> &dispatcher();
13-
};
9+
INFINICORE_GRAPH_OP_CLASS(Gemm, Tensor, Tensor, Tensor, float, float);
1410

1511
Tensor gemm(Tensor a, Tensor b, float alpha = 1.0f, float beta = 0.0f);
1612
void gemm_(Tensor c, Tensor a, Tensor b, float alpha, float beta);
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#pragma once
2+
3+
#include "../device.hpp"
4+
#include "common/op.hpp"
5+
#include <optional>
6+
7+
namespace infinicore::op {
8+
9+
class PagedAttention {
10+
public:
11+
using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, std::optional<Tensor>, float);
12+
static void execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale);
13+
static common::OpDispatcher<schema> &dispatcher();
14+
};
15+
16+
Tensor paged_attention(Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale);
17+
void paged_attention_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale);
18+
} // namespace infinicore::op
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#pragma once
2+
3+
#include "../device.hpp"
4+
#include "common/op.hpp"
5+
#include <optional>
6+
7+
namespace infinicore::op {
8+
9+
class PagedAttentionPrefill {
10+
public:
11+
/**
12+
* @brief PagedAttentionPrefill operator signature
13+
*
* Argument order:
14+
* 1. out: Output tensor (Packed format)
15+
* 2. q: Current Query tensor (Packed format)
16+
* 3. k_cache: Physical Key cache (Paged format)
17+
* 4. v_cache: Physical Value cache (Paged format)
18+
* 5. block_tables: Mapping table from logical blocks to physical blocks
19+
* 6. total_kv_lens: lengths of Complete Key/Value for each request
20+
* 7. cu_seqlens_q: Cumulative sequence lengths of Query (prefix sum for variable-length batch)
21+
* 8. alibi_slopes: ALiBi bias slopes (optional)
22+
* 9. scale: Scaling factor (typically 1/sqrt(head_size))
23+
*/
24+
using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, std::optional<Tensor>, float);
25+
26+
static void execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
27+
Tensor block_tables, Tensor total_kv_lens, Tensor cum_seqlens_q,
28+
std::optional<Tensor> alibi_slopes, float scale);
29+
30+
static common::OpDispatcher<schema> &dispatcher();
31+
};
32+
33+
Tensor paged_attention_prefill(Tensor q,
34+
Tensor k_cache,
35+
Tensor v_cache,
36+
Tensor block_tables,
37+
Tensor total_kv_lens,
38+
Tensor cum_seqlens_q,
39+
std::optional<Tensor> alibi_slopes,
40+
float scale);
41+
42+
void paged_attention_prefill_(Tensor out,
43+
Tensor q,
44+
Tensor k_cache,
45+
Tensor v_cache,
46+
Tensor block_tables,
47+
Tensor total_kv_lens,
48+
Tensor cum_seqlens_q,
49+
std::optional<Tensor> alibi_slopes,
50+
float scale);
51+
52+
} // namespace infinicore::op
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#pragma once
2+
3+
#include "../device.hpp"
4+
#include "common/op.hpp"
5+
6+
namespace infinicore::op {
7+
8+
class PagedCaching {
9+
public:
10+
using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor);
11+
static void execute(Tensor k_cache, Tensor v_cache, Tensor k, Tensor v, Tensor slot_mapping);
12+
static common::OpDispatcher<schema> &dispatcher();
13+
};
14+
15+
void paged_caching_(Tensor k_cache, Tensor v_cache, Tensor k, Tensor v, Tensor slot_mapping);
16+
17+
} // namespace infinicore::op

0 commit comments

Comments
 (0)