Skip to content

Commit 784139b

Browse files
authored
Merge pull request #990 from InfiniTensor/demo131
Demo-131 Cuda graph with optimized paged attention
2 parents 3c8fb3c + 1d6527c commit 784139b

582 files changed

Lines changed: 24359 additions & 2060 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitmodules

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
11
[submodule "third_party/spdlog"]
22
path = third_party/spdlog
33
url = https://github.com/gabime/spdlog.git
4+
[submodule "third_party/nlohmann_json"]
5+
path = third_party/nlohmann_json
6+
url = https://github.com/nlohmann/json.git
7+
branch = master

README.md

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ InfiniCore 是一个跨平台统一编程工具集,为不同芯片平台的功
2020
- 天数智芯 GPU;
2121
- 沐曦 GPU;
2222
- 海光 DCU;
23+
- 阿里 PPU;
2324
- 华为昇腾 NPU;
2425
- 寒武纪 MLU;
2526
- 昆仑芯 XPU;
@@ -103,6 +104,7 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]
103104
| `--qy-gpu=[y\|n]` | 是否编译QY GPU 接口实现 | n
104105
| `--hygon-dcu=[y\|n]` | 是否编译海光 DCU 接口实现 | n
105106
| `--kunlun-xpu=[y\|n]` | 是否编译昆仑 XPU 接口实现 | n
107+
| `--ali-ppu=[y\|n]` | 是否编译阿里 PPU 接口实现 | n
106108
| `--ninetoothed=[y\|n]` | 是否编译九齿实现 | n
107109
| `--ccl=[y\|n]` | 是否编译 InfiniCCL 通信库接口实现 | n
108110

@@ -187,9 +189,9 @@ pip install -e .
187189

188190
```bash
189191
# 测试单算子
190-
python test/infinicore/ops/[operator].py [--bench | --debug | --verbose] [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon]
192+
python test/infinicore/ops/[operator].py [--bench | --debug | --verbose] [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon | --ali]
191193
# 测试全部算子
192-
python test/infinicore/run.py [--bench | --debug | --verbose] [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun]
194+
python test/infinicore/run.py [--bench | --debug | --verbose] [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --ali]
193195
```
194196

195197
使用 -h 查看更多参数。
@@ -198,9 +200,9 @@ python test/infinicore/run.py [--bench | --debug | --verbose] [--cpu | --nvidia
198200

199201
```shell
200202
# 测试单算子
201-
python test/infiniop/[operator].py [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon]
203+
python test/infiniop/[operator].py [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon | --ali]
202204
# 测试全部算子
203-
python scripts/python_test.py [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon]
205+
python scripts/python_test.py [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon | --ali]
204206
```
205207

206208
#### 通信库(InfiniCCL)测试

include/infinicore.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ typedef enum {
4747
INFINI_DEVICE_KUNLUN = 7,
4848
INFINI_DEVICE_HYGON = 8,
4949
INFINI_DEVICE_QY = 9,
50+
INFINI_DEVICE_ALI = 10,
5051
INFINI_DEVICE_TYPE_COUNT
5152
} infiniDevice_t;
5253

include/infinicore.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@
33
#include "infinicore/device_event.hpp"
44
#include "infinicore/nn.hpp"
55
#include "infinicore/ops.hpp"
6+
#include "infinicore/quantization.hpp"
67
#include "infinicore/tensor.hpp"

include/infinicore/device.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ class Device {
2222
KUNLUN = INFINI_DEVICE_KUNLUN,
2323
HYGON = INFINI_DEVICE_HYGON,
2424
QY = INFINI_DEVICE_QY,
25+
ALI = INFINI_DEVICE_ALI,
2526
COUNT = INFINI_DEVICE_TYPE_COUNT,
2627
};
2728

include/infinicore/graph/graph.hpp

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,15 @@ class GraphTensor : public Tensor {
1515
};
1616

1717
class GraphOperator {
18+
public:
19+
virtual void run() const = 0;
20+
virtual ~GraphOperator() = default;
21+
};
1822

23+
class DispatchableGraphOperator : public GraphOperator {
1924
public:
20-
void run() const;
21-
~GraphOperator();
25+
void run() const override;
26+
~DispatchableGraphOperator() override;
2227

2328
protected:
2429
using run_schema = void (*)(void *);
@@ -49,7 +54,7 @@ class Graph {
4954
} // namespace infinicore::graph
5055

5156
#define INFINICORE_GRAPH_OP_CLASS(__OP_NAME__, ...) \
52-
class __OP_NAME__ : public graph::GraphOperator { \
57+
class __OP_NAME__ : public graph::DispatchableGraphOperator { \
5358
public: \
5459
using schema = void (*)(__VA_ARGS__); \
5560
using plan_schema = void *(*)(__VA_ARGS__); \
@@ -79,12 +84,12 @@ class Graph {
7984
runner_ = run_dispatcher().lookup(__DEVICE_TYPE__); \
8085
deleter_ = cleanup_dispatcher().lookup(__DEVICE_TYPE__);
8186

82-
#define INFINICORE_GRAPH_OP_RECORD_OR_RUN(__OP_NAME__, ...) \
83-
auto op = std::make_shared<__OP_NAME__>(__VA_ARGS__); \
84-
if (context::isGraphRecording()) { \
85-
context::addGraphOperator(op); \
86-
} else { \
87-
op->run(); \
87+
#define INFINICORE_GRAPH_OP_RECORD_OR_RUN(__OP_NAME__, ...) \
88+
auto ___op = std::make_shared<__OP_NAME__>(__VA_ARGS__); \
89+
if (context::isGraphRecording()) { \
90+
context::addGraphOperator(___op); \
91+
} else { \
92+
___op->run(); \
8893
}
8994

9095
#define INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(__OP_NAME__, __PLAN_F__, __RUN_F__, __CLEANUP_F__) \

include/infinicore/nn/linear.hpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
#pragma once
22

33
#include "../ops.hpp"
4+
#include "../quantization.hpp"
45
#include "module.hpp"
56
#include <infiniccl.h>
7+
#include <optional>
68

79
namespace infinicore::nn {
810

@@ -11,6 +13,9 @@ class BaseLinear : public Module {
1113
BaseLinear(size_t in_features, size_t out_features, bool bias = true,
1214
const DataType &dtype = DataType::F32, const Device &device = Device());
1315

16+
BaseLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
17+
const DataType &dtype = DataType::F32, const Device &device = Device());
18+
1419
// Forward pass: output = input @ weight.T + bias
1520
Tensor forward(Tensor &input) const;
1621

@@ -27,12 +32,17 @@ class BaseLinear : public Module {
2732
// Accessors for parameters
2833
Tensor weight() const { return weight_; }
2934
Tensor bias() const { return bias_; }
35+
Tensor weight_scale() const { return weight_scale_; }
36+
Tensor weight_zeros() const { return weight_zeros_; }
3037

3138
protected:
3239
// Parameters
3340
INFINICORE_NN_PARAMETER(weight);
3441
INFINICORE_NN_PARAMETER(bias);
3542

43+
INFINICORE_NN_PARAMETER(weight_scale);
44+
INFINICORE_NN_PARAMETER(weight_zeros);
45+
3646
protected:
3747
// Helper method for common forward computation
3848
Tensor compute_linear(Tensor &input) const;
@@ -41,6 +51,7 @@ class BaseLinear : public Module {
4151
size_t out_features_;
4252
bool has_bias_;
4353
DataType dtype_;
54+
std::shared_ptr<infinicore::quantization::BaseQuantization> quantization_ = std::make_shared<infinicore::quantization::NoneQuantization>(nullptr);
4455
};
4556

4657
} // namespace infinicore::nn
@@ -52,6 +63,9 @@ class Linear : public BaseLinear {
5263
Linear(size_t in_features, size_t out_features, bool bias = true,
5364
const DataType &dtype = DataType::F32, const Device &device = Device());
5465

66+
Linear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
67+
const DataType &dtype = DataType::F32, const Device &device = Device());
68+
5569
// Forward pass: output = input @ weight.T + bias
5670
Tensor forward(Tensor &input) const;
5771

@@ -65,6 +79,10 @@ class ColumnParallelLinear : public BaseLinear {
6579
const DataType &dtype = DataType::F32, const Device &device = Device(),
6680
Size tp_rank = 0, Size tp_size = 1);
6781

82+
ColumnParallelLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
83+
const DataType &dtype = DataType::F32, const Device &device = Device(),
84+
Size tp_rank = 0, Size tp_size = 1);
85+
6886
// Forward pass: output = input @ weight.T + bias
6987
Tensor forward(Tensor &input) const;
7088

@@ -82,6 +100,10 @@ class RowParallelLinear : public BaseLinear {
82100
const DataType &dtype = DataType::F32, const Device &device = Device(),
83101
Size tp_rank = 0, Size tp_size = 1, infinicclComm_t communicator = nullptr);
84102

103+
RowParallelLinear(size_t in_features, size_t out_features, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias = true,
104+
const DataType &dtype = DataType::F32, const Device &device = Device(),
105+
Size tp_rank = 0, Size tp_size = 1, infinicclComm_t communicator = nullptr);
106+
85107
// Forward pass: output = input @ weight.T + bias
86108
Tensor forward(Tensor &input) const;
87109

include/infinicore/nn/rmsnorm.hpp

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#pragma once
22

3-
#include "module.hpp"
43
#include "../ops.hpp"
4+
#include "module.hpp"
55

66
namespace infinicore::nn {
77

@@ -57,6 +57,21 @@ class RMSNorm : public Module {
5757
*/
5858
Tensor forward(const Tensor &x) const;
5959

60+
/**
61+
* @brief Forward pass: apply RMSNorm in-place with residual
62+
*
63+
* @param x Input tensor of shape (*, normalized_shape) where * is any number of dimensions.
64+
* Will be modified in-place to the normalized output.
65+
* @param residual Residual tensor to add to input before normalization.
66+
* Will be modified in-place to the sum of input and residual.
67+
*
68+
* The normalization is applied over the last dimension.
69+
* For example:
70+
* Input: [batch, seq_len, hidden_size] -> normalize over hidden_size
71+
* Input: [batch, hidden_size] -> normalize over hidden_size
72+
*/
73+
void forward_inplace(Tensor &x, Tensor &residual) const;
74+
6075
// Module information
6176
size_t normalized_shape() const { return normalized_shape_; }
6277
double eps() const { return eps_; }
@@ -73,9 +88,9 @@ class RMSNorm : public Module {
7388
INFINICORE_NN_PARAMETER(weight);
7489

7590
private:
76-
size_t normalized_shape_; // Size of the feature dimension
77-
double eps_; // Epsilon for numerical stability
78-
DataType dtype_; // Data type for weight
91+
size_t normalized_shape_; // Size of the feature dimension
92+
double eps_; // Epsilon for numerical stability
93+
DataType dtype_; // Data type for weight
7994
};
8095

8196
} // namespace infinicore::nn

include/infinicore/ops.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
#include "ops/add_rms_norm.hpp"
55
#include "ops/attention.hpp"
66
#include "ops/causal_softmax.hpp"
7+
#include "ops/embedding.hpp"
8+
#include "ops/flash_attention.hpp"
9+
#include "ops/kv_caching.hpp"
710
#include "ops/matmul.hpp"
811
#include "ops/ones.hpp"
912
#include "ops/paged_attention.hpp"
@@ -14,4 +17,5 @@
1417
#include "ops/rms_norm.hpp"
1518
#include "ops/rope.hpp"
1619
#include "ops/silu.hpp"
20+
#include "ops/silu_and_mul.hpp"
1721
#include "ops/swiglu.hpp"

include/infinicore/ops/add.hpp

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,14 @@
11
#pragma once
22

33
#include "../device.hpp"
4+
#include "../graph/graph.hpp"
45
#include "common/op.hpp"
56

67
namespace infinicore::op {
7-
class Add {
8-
public:
9-
using schema = void (*)(Tensor, Tensor, Tensor);
10-
static void execute(Tensor c, Tensor a, Tensor b);
11-
static common::OpDispatcher<schema> &dispatcher();
12-
};
138

14-
Tensor add(Tensor a, Tensor b);
15-
void add_(Tensor c, Tensor a, Tensor b);
16-
Tensor operator+(Tensor a, Tensor b);
9+
INFINICORE_GRAPH_OP_CLASS(Add, Tensor, const Tensor &, const Tensor &);
10+
11+
Tensor add(const Tensor &a, const Tensor &b);
12+
void add_(Tensor c, const Tensor &a, const Tensor &b);
13+
1714
} // namespace infinicore::op

0 commit comments

Comments (0)