Skip to content

Commit 784139b

Browse files
authored
Merge pull request #990 from InfiniTensor/demo131
Demo-131 Cuda graph with optimized paged attention
2 parents 3c8fb3c + 1d6527c commit 784139b

582 files changed

Lines changed: 24359 additions & 2060 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitmodules

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
11
[submodule "third_party/spdlog"]
22
path = third_party/spdlog
33
url = https://github.com/gabime/spdlog.git
4+
[submodule "third_party/nlohmann_json"]
5+
path = third_party/nlohmann_json
6+
url = https://github.com/nlohmann/json.git
7+
branch = master

README.md

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ InfiniCore 是一个跨平台统一编程工具集,为不同芯片平台的功
2020
- 天数智芯 GPU;
2121
- 沐曦 GPU;
2222
- 海光 DCU;
23+
- 阿里 PPU;
2324
- 华为昇腾 NPU;
2425
- 寒武纪 MLU;
2526
- 昆仑芯 XPU;
@@ -103,6 +104,7 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]
103104
| `--qy-gpu=[y\|n]` | 是否编译QY GPU 接口实现 | n
104105
| `--hygon-dcu=[y\|n]` | 是否编译海光 DCU 接口实现 | n
105106
| `--kunlun-xpu=[y\|n]` | 是否编译昆仑 XPU 接口实现 | n
107+
| `--ali-ppu=[y\|n]` | 是否编译阿里 PPU 接口实现 | n
106108
| `--ninetoothed=[y\|n]` | 是否编译九齿实现 | n
107109
| `--ccl=[y\|n]` | 是否编译 InfiniCCL 通信库接口实现 | n
108110

@@ -187,9 +189,9 @@ pip install -e .
187189

188190
```bash
189191
# 测试单算子
190-
python test/infinicore/ops/[operator].py [--bench | --debug | --verbose] [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon]
192+
python test/infinicore/ops/[operator].py [--bench | --debug | --verbose] [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon | --ali]
191193
# 测试全部算子
192-
python test/infinicore/run.py [--bench | --debug | --verbose] [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun]
194+
python test/infinicore/run.py [--bench | --debug | --verbose] [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --ali]
193195
```
194196

195197
使用 -h 查看更多参数。
@@ -198,9 +200,9 @@ python test/infinicore/run.py [--bench | --debug | --verbose] [--cpu | --nvidia
198200

199201
```shell
200202
# 测试单算子
201-
python test/infiniop/[operator].py [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon]
203+
python test/infiniop/[operator].py [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon | --ali]
202204
# 测试全部算子
203-
python scripts/python_test.py [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon]
205+
python scripts/python_test.py [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon | --ali]
204206
```
205207

206208
#### 通信库(InfiniCCL)测试

include/infinicore.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ typedef enum {
4747
INFINI_DEVICE_KUNLUN = 7,
4848
INFINI_DEVICE_HYGON = 8,
4949
INFINI_DEVICE_QY = 9,
50+
INFINI_DEVICE_ALI = 10,
5051
INFINI_DEVICE_TYPE_COUNT
5152
} infiniDevice_t;
5253

include/infinicore.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@
33
#include "infinicore/device_event.hpp"
44
#include "infinicore/nn.hpp"
55
#include "infinicore/ops.hpp"
6+
#include "infinicore/quantization.hpp"
67
#include "infinicore/tensor.hpp"

include/infinicore/device.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ class Device {
2222
KUNLUN = INFINI_DEVICE_KUNLUN,
2323
HYGON = INFINI_DEVICE_HYGON,
2424
QY = INFINI_DEVICE_QY,
25+
ALI = INFINI_DEVICE_ALI,
2526
COUNT = INFINI_DEVICE_TYPE_COUNT,
2627
};
2728

include/infinicore/graph/graph.hpp

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,15 @@ class GraphTensor : public Tensor {
1515
};
1616

1717
class GraphOperator {
18+
public:
19+
virtual void run() const = 0;
20+
virtual ~GraphOperator() = default;
21+
};
1822

23+
class DispatchableGraphOperator : public GraphOperator {
1924
public:
20-
void run() const;
21-
~GraphOperator();
25+
void run() const override;
26+
~DispatchableGraphOperator() override;
2227

2328
protected:
2429
using run_schema = void (*)(void *);
@@ -49,7 +54,7 @@ class Graph {
4954
} // namespace infinicore::graph
5055

5156
#define INFINICORE_GRAPH_OP_CLASS(__OP_NAME__, ...) \
52-
class __OP_NAME__ : public graph::GraphOperator { \
57+
class __OP_NAME__ : public graph::DispatchableGraphOperator { \
5358
public: \
5459
using schema = void (*)(__VA_ARGS__); \
5560
using plan_schema = void *(*)(__VA_ARGS__); \
@@ -79,12 +84,12 @@ class Graph {
7984
runner_ = run_dispatcher().lookup(__DEVICE_TYPE__); \
8085
deleter_ = cleanup_dispatcher().lookup(__DEVICE_TYPE__);
8186

82-
#define INFINICORE_GRAPH_OP_RECORD_OR_RUN(__OP_NAME__, ...) \
83-
auto op = std::make_shared<__OP_NAME__>(__VA_ARGS__); \
84-
if (context::isGraphRecording()) { \
85-
context::addGraphOperator(op); \
86-
} else { \
87-
op->run(); \
87+
#define INFINICORE_GRAPH_OP_RECORD_OR_RUN(__OP_NAME__, ...) \
88+
auto ___op = std::make_shared<__OP_NAME__>(__VA_ARGS__); \
89+
if (context::isGraphRecording()) { \
90+
context::addGraphOperator(___op); \
91+
} else { \
92+
___op->run(); \
8893
}
8994

9095
#define INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(__OP_NAME__, __PLAN_F__, __RUN_F__, __CLEANUP_F__) \

include/infinicore/nn/linear.hpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
#pragma once
22

33
#include "../ops.hpp"
4+
#include "../quantization.hpp"
45
#include "module.hpp"
56
#include <infiniccl.h>
7+
#include <optional>
68

79
namespace infinicore::nn {
810

@@ -11,6 +13,9 @@ class BaseLinear : public Module {
1113
BaseLinear(size_t in_features, size_t out_features, bool bias = true,
1214
const DataType &dtype = DataType::F32, const Device &device = Device());
1315

16+
BaseLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
17+
const DataType &dtype = DataType::F32, const Device &device = Device());
18+
1419
// Forward pass: output = input @ weight.T + bias
1520
Tensor forward(Tensor &input) const;
1621

@@ -27,12 +32,17 @@ class BaseLinear : public Module {
2732
// Accessors for parameters
2833
Tensor weight() const { return weight_; }
2934
Tensor bias() const { return bias_; }
35+
Tensor weight_scale() const { return weight_scale_; }
36+
Tensor weight_zeros() const { return weight_zeros_; }
3037

3138
protected:
3239
// Parameters
3340
INFINICORE_NN_PARAMETER(weight);
3441
INFINICORE_NN_PARAMETER(bias);
3542

43+
INFINICORE_NN_PARAMETER(weight_scale);
44+
INFINICORE_NN_PARAMETER(weight_zeros);
45+
3646
protected:
3747
// Helper method for common forward computation
3848
Tensor compute_linear(Tensor &input) const;
@@ -41,6 +51,7 @@ class BaseLinear : public Module {
4151
size_t out_features_;
4252
bool has_bias_;
4353
DataType dtype_;
54+
std::shared_ptr<infinicore::quantization::BaseQuantization> quantization_ = std::make_shared<infinicore::quantization::NoneQuantization>(nullptr);
4455
};
4556

4657
} // namespace infinicore::nn
@@ -52,6 +63,9 @@ class Linear : public BaseLinear {
5263
Linear(size_t in_features, size_t out_features, bool bias = true,
5364
const DataType &dtype = DataType::F32, const Device &device = Device());
5465

66+
Linear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
67+
const DataType &dtype = DataType::F32, const Device &device = Device());
68+
5569
// Forward pass: output = input @ weight.T + bias
5670
Tensor forward(Tensor &input) const;
5771

@@ -65,6 +79,10 @@ class ColumnParallelLinear : public BaseLinear {
6579
const DataType &dtype = DataType::F32, const Device &device = Device(),
6680
Size tp_rank = 0, Size tp_size = 1);
6781

82+
ColumnParallelLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
83+
const DataType &dtype = DataType::F32, const Device &device = Device(),
84+
Size tp_rank = 0, Size tp_size = 1);
85+
6886
// Forward pass: output = input @ weight.T + bias
6987
Tensor forward(Tensor &input) const;
7088

@@ -82,6 +100,10 @@ class RowParallelLinear : public BaseLinear {
82100
const DataType &dtype = DataType::F32, const Device &device = Device(),
83101
Size tp_rank = 0, Size tp_size = 1, infinicclComm_t communicator = nullptr);
84102

103+
RowParallelLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
104+
const DataType &dtype = DataType::F32, const Device &device = Device(),
105+
Size tp_rank = 0, Size tp_size = 1, infinicclComm_t communicator = nullptr);
106+
85107
// Forward pass: output = input @ weight.T + bias
86108
Tensor forward(Tensor &input) const;
87109

include/infinicore/nn/rmsnorm.hpp

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#pragma once
22

3-
#include "module.hpp"
43
#include "../ops.hpp"
4+
#include "module.hpp"
55

66
namespace infinicore::nn {
77

@@ -57,6 +57,21 @@ class RMSNorm : public Module {
5757
*/
5858
Tensor forward(const Tensor &x) const;
5959

60+
/**
61+
* @brief Forward pass: apply RMSNorm in-place with residual
62+
*
63+
* @param x Input tensor of shape (*, normalized_shape) where * is any number of dimensions.
64+
* Will be modified in-place to the normalized output.
65+
* @param residual Residual tensor to add to input before normalization.
66+
* Will be modified in-place to the sum of input and residual.
67+
*
68+
* The normalization is applied over the last dimension.
69+
* For example:
70+
* Input: [batch, seq_len, hidden_size] -> normalize over hidden_size
71+
* Input: [batch, hidden_size] -> normalize over hidden_size
72+
*/
73+
void forward_inplace(Tensor &x, Tensor &residual) const;
74+
6075
// Module information
6176
size_t normalized_shape() const { return normalized_shape_; }
6277
double eps() const { return eps_; }
@@ -73,9 +88,9 @@ class RMSNorm : public Module {
7388
INFINICORE_NN_PARAMETER(weight);
7489

7590
private:
76-
size_t normalized_shape_; // Size of the feature dimension
77-
double eps_; // Epsilon for numerical stability
78-
DataType dtype_; // Data type for weight
91+
size_t normalized_shape_; // Size of the feature dimension
92+
double eps_; // Epsilon for numerical stability
93+
DataType dtype_; // Data type for weight
7994
};
8095

8196
} // namespace infinicore::nn

include/infinicore/ops.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
#include "ops/add_rms_norm.hpp"
55
#include "ops/attention.hpp"
66
#include "ops/causal_softmax.hpp"
7+
#include "ops/embedding.hpp"
8+
#include "ops/flash_attention.hpp"
9+
#include "ops/kv_caching.hpp"
710
#include "ops/matmul.hpp"
811
#include "ops/ones.hpp"
912
#include "ops/paged_attention.hpp"
@@ -14,4 +17,5 @@
1417
#include "ops/rms_norm.hpp"
1518
#include "ops/rope.hpp"
1619
#include "ops/silu.hpp"
20+
#include "ops/silu_and_mul.hpp"
1721
#include "ops/swiglu.hpp"

include/infinicore/ops/add.hpp

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,14 @@
11
#pragma once
22

33
#include "../device.hpp"
4+
#include "../graph/graph.hpp"
45
#include "common/op.hpp"
56

67
namespace infinicore::op {
7-
class Add {
8-
public:
9-
using schema = void (*)(Tensor, Tensor, Tensor);
10-
static void execute(Tensor c, Tensor a, Tensor b);
11-
static common::OpDispatcher<schema> &dispatcher();
12-
};
138

14-
Tensor add(Tensor a, Tensor b);
15-
void add_(Tensor c, Tensor a, Tensor b);
16-
Tensor operator+(Tensor a, Tensor b);
9+
INFINICORE_GRAPH_OP_CLASS(Add, Tensor, const Tensor &, const Tensor &);
10+
11+
Tensor add(const Tensor &a, const Tensor &b);
12+
void add_(Tensor c, const Tensor &a, const Tensor &b);
13+
1714
} // namespace infinicore::op

0 commit comments

Comments (0)