Skip to content

Commit 829d293

Browse files
committed
Merge remote-tracking branch 'upstream/main' into 2025-autumn-PPPoint-t-T1-1-4
2 parents 2ae3b59 + 148b475 commit 829d293

146 files changed

Lines changed: 10947 additions & 1426 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the searchbox below for content that may be hidden.

include/infinicore/common/hash.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
#include "../tensor.hpp"
44

5+
#include <optional>
56
#include <type_traits>
67

78
namespace infinicore {
@@ -24,6 +25,15 @@ inline void hash_combine(size_t &seed, Tensor tensor) {
2425
}
2526
}
2627

28+
// Specialization for std::optional
29+
template <typename T>
30+
inline void hash_combine(size_t &seed, const std::optional<T> &opt) {
31+
hash_combine(seed, opt.has_value());
32+
if (opt) {
33+
hash_combine(seed, *opt);
34+
}
35+
}
36+
2737
// Specialization for std::string
2838
inline void hash_combine(size_t &seed, const std::string &str) {
2939
hash_combine(seed, std::hash<std::string>{}(str));

include/infinicore/context/context.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
#include "../device.hpp"
44
#include "../memory.hpp"
55

6+
#include "../graph/graph.hpp"
7+
68
#include <infiniop.h>
79
#include <infinirt.h>
810

@@ -40,6 +42,12 @@ void destroyEvent(infinirtEvent_t event);
4042
float elapsedTime(infinirtEvent_t start, infinirtEvent_t end);
4143
void streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event);
4244

45+
// Graph recording APIs
46+
bool isGraphRecording();
47+
void startGraphRecording();
48+
void addGraphOperator(std::shared_ptr<graph::GraphOperator> op);
49+
std::shared_ptr<graph::Graph> stopGraphRecording();
50+
4351
} // namespace context
4452

4553
} // namespace infinicore

include/infinicore/graph/graph.hpp

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
#pragma once
2+
3+
#include <memory>
4+
#include <vector>
5+
6+
#include "../tensor.hpp"
7+
8+
namespace infinicore::graph {
9+
// Forward declarations
10+
class GraphManager;
11+
12+
class GraphTensor : public Tensor {
13+
public:
14+
GraphTensor(const Tensor &);
15+
void resume() const;
16+
};
17+
18+
class GraphOperator {
19+
20+
public:
21+
void run() const;
22+
~GraphOperator();
23+
24+
protected:
25+
using run_schema = void (*)(void *);
26+
using cleanup_schema = void (*)(void **);
27+
void *planned_meta_;
28+
run_schema runner_;
29+
cleanup_schema deleter_;
30+
};
31+
32+
class Graph {
33+
public:
34+
Graph() = default;
35+
~Graph() = default;
36+
37+
void run() const;
38+
39+
protected:
40+
void add_operator(std::shared_ptr<GraphOperator> op);
41+
42+
std::vector<std::shared_ptr<GraphOperator>> op_list_;
43+
44+
friend class GraphManager;
45+
};
46+
} // namespace infinicore::graph
47+
48+
#define INFINICORE_GRAPH_OP_CLASS(__OP_NAME__, ...) \
49+
class __OP_NAME__ : public graph::GraphOperator { \
50+
public: \
51+
using schema = void (*)(__VA_ARGS__); \
52+
using plan_schema = void *(*)(__VA_ARGS__); \
53+
static common::OpDispatcher<plan_schema> &plan_dispatcher(); \
54+
static common::OpDispatcher<run_schema> &run_dispatcher(); \
55+
static common::OpDispatcher<cleanup_schema> &cleanup_dispatcher(); \
56+
__OP_NAME__(__VA_ARGS__); \
57+
static void execute(__VA_ARGS__); \
58+
};
59+
60+
#define INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(__OP_NAME__) \
61+
common::OpDispatcher<__OP_NAME__::plan_schema> &__OP_NAME__::plan_dispatcher() { \
62+
static common::OpDispatcher<__OP_NAME__::plan_schema> dispatcher_; \
63+
return dispatcher_; \
64+
} \
65+
common::OpDispatcher<__OP_NAME__::run_schema> &__OP_NAME__::run_dispatcher() { \
66+
static common::OpDispatcher<__OP_NAME__::run_schema> dispatcher_; \
67+
return dispatcher_; \
68+
} \
69+
common::OpDispatcher<__OP_NAME__::cleanup_schema> &__OP_NAME__::cleanup_dispatcher() { \
70+
static common::OpDispatcher<__OP_NAME__::cleanup_schema> dispatcher_; \
71+
return dispatcher_; \
72+
}
73+
74+
#define INFINICORE_GRAPH_OP_DISPATCH(__DEVICE_TYPE__, ...) \
75+
planned_meta_ = plan_dispatcher().lookup(__DEVICE_TYPE__)(__VA_ARGS__); \
76+
runner_ = run_dispatcher().lookup(__DEVICE_TYPE__); \
77+
deleter_ = cleanup_dispatcher().lookup(__DEVICE_TYPE__);
78+
79+
#define INFINICORE_GRAPH_OP_RECORD_OR_RUN(__OP_NAME__, ...) \
80+
auto op = std::make_shared<__OP_NAME__>(__VA_ARGS__); \
81+
if (context::isGraphRecording()) { \
82+
context::addGraphOperator(op); \
83+
} else { \
84+
op->run(); \
85+
}
86+
87+
#define INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(__OP_NAME__, __PLAN_F__, __RUN_F__, __CLEANUP_F__) \
88+
static bool registered = []() { \
89+
__OP_NAME__::plan_dispatcher().registerAll(__PLAN_F__, false); \
90+
__OP_NAME__::run_dispatcher().registerAll(__RUN_F__, false); \
91+
__OP_NAME__::cleanup_dispatcher().registerAll(__CLEANUP_F__, false); \
92+
return true; \
93+
}();

include/infinicore/nn/rope.hpp

Lines changed: 50 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,47 @@ class RoPE : public Module {
1717
GPT_NEOX = 1, // GPT-NeoX style RoPE algorithm (First half dimensions for sin, second half for cos)
1818
};
1919

20+
enum class ScalingType {
21+
DEFAULT = 0, // Default RoPE
22+
LONGROPE = 1 // Long-RoPE
23+
};
24+
25+
class ScalingConfig {
26+
public:
27+
virtual ~ScalingConfig() = default;
28+
ScalingType type() const { return type_; }
29+
30+
protected:
31+
ScalingType type_ = ScalingType::DEFAULT;
32+
ScalingConfig(ScalingType type) : type_(type) {}
33+
};
34+
35+
// longrope scaling
36+
class LongRopeConfig : public ScalingConfig {
37+
protected:
38+
std::vector<float> short_factor_;
39+
std::vector<float> long_factor_;
40+
size_t original_max_position_embeddings_;
41+
float factor_;
42+
43+
public:
44+
LongRopeConfig(
45+
std::vector<float> short_factor,
46+
std::vector<float> long_factor,
47+
size_t original_max_position_embeddings,
48+
float factor = 1.0f)
49+
: ScalingConfig(ScalingType::LONGROPE),
50+
short_factor_(short_factor),
51+
long_factor_(long_factor),
52+
original_max_position_embeddings_(original_max_position_embeddings),
53+
factor_(factor == 1.0f ? 1.0f : std::sqrt(1 + std::log(factor) / std::log(original_max_position_embeddings))) {}
54+
~LongRopeConfig() override = default;
55+
size_t original_max_position_embeddings() const { return original_max_position_embeddings_; }
56+
const std::vector<float> &short_factor() const { return short_factor_; }
57+
const std::vector<float> &long_factor() const { return long_factor_; }
58+
float factor() const { return factor_; }
59+
};
60+
2061
/**
2162
* @brief Construct a RoPE layer
2263
*
@@ -26,13 +67,15 @@ class RoPE : public Module {
2667
* @param algo RoPE algorithm type (default: Algo::GPT_J)
2768
* @param dtype Data type for sin/cos cache (default: DataType::F32)
2869
* @param device Device to create the cache on
70+
* @param scaling RoPE scaling type (default: nullptr)
2971
*/
3072
RoPE(size_t head_dim,
3173
size_t max_seq_len,
3274
double theta = 10000.0,
3375
Algo algo = Algo::GPT_J,
3476
const DataType &dtype = DataType::F32,
35-
const Device &device = Device());
77+
const Device &device = Device(),
78+
std::shared_ptr<ScalingConfig> scaling = nullptr);
3679

3780
/**
3881
* @brief Forward pass: apply RoPE to a tensor
@@ -88,11 +131,12 @@ class RoPE : public Module {
88131
private:
89132
void initialize_cache();
90133

91-
size_t head_dim_; // Dimension of each attention head
92-
size_t max_seq_len_; // Maximum sequence length
93-
double theta_; // Base frequency for rotary embeddings
94-
Algo algo_; // RoPE algorithm type
95-
DataType dtype_; // Data type for cache tables
134+
size_t head_dim_; // Dimension of each attention head
135+
size_t max_seq_len_; // Maximum sequence length
136+
double theta_; // Base frequency for rotary embeddings
137+
Algo algo_; // RoPE algorithm type
138+
DataType dtype_; // Data type for cache tables
139+
std::shared_ptr<ScalingConfig> scaling_; // RoPE scaling type
96140
};
97141

98142
} // namespace infinicore::nn

include/infinicore/ops.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
11
#pragma once
22

33
#include "ops/add.hpp"
4+
#include "ops/add_rms_norm.hpp"
45
#include "ops/attention.hpp"
56
#include "ops/causal_softmax.hpp"
67
#include "ops/matmul.hpp"
78
#include "ops/ones.hpp"
9+
#include "ops/paged_attention.hpp"
10+
#include "ops/paged_attention_prefill.hpp"
11+
#include "ops/paged_caching.hpp"
12+
#include "ops/random_sample.hpp"
813
#include "ops/rearrange.hpp"
914
#include "ops/rms_norm.hpp"
1015
#include "ops/rope.hpp"
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#pragma once
2+
3+
#include "../device.hpp"
4+
#include "common/op.hpp"
5+
#include <utility>
6+
7+
namespace infinicore::op {
8+
class AddRMSNorm {
9+
public:
10+
using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor, float);
11+
static void execute(Tensor y, Tensor residual_out, Tensor a, Tensor b, Tensor weight, float epsilon = 1e-5f);
12+
static common::OpDispatcher<schema> &dispatcher();
13+
};
14+
15+
// Fused Add and RMS Normalization
16+
// Returns: (normalized_result, add_result)
17+
// The add_result can be used as residual for subsequent layers
18+
std::pair<Tensor, Tensor> add_rms_norm(Tensor a, Tensor b, Tensor weight, float epsilon = 1e-5f);
19+
void add_rms_norm_(Tensor y, Tensor residual_out, Tensor a, Tensor b, Tensor weight, float epsilon = 1e-5f);
20+
} // namespace infinicore::op

include/infinicore/ops/gemm.hpp

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,12 @@
11
#pragma once
22

33
#include "../device.hpp"
4+
#include "../graph/graph.hpp"
45
#include "common/op.hpp"
56

67
namespace infinicore::op {
78

8-
class Gemm {
9-
public:
10-
using schema = void (*)(Tensor, Tensor, Tensor, float, float);
11-
static void execute(Tensor c, Tensor a, Tensor b, float alpha, float beta);
12-
static common::OpDispatcher<schema> &dispatcher();
13-
};
9+
INFINICORE_GRAPH_OP_CLASS(Gemm, Tensor, Tensor, Tensor, float, float);
1410

1511
Tensor gemm(Tensor a, Tensor b, float alpha = 1.0f, float beta = 0.0f);
1612
void gemm_(Tensor c, Tensor a, Tensor b, float alpha, float beta);
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#pragma once
2+
3+
#include "../device.hpp"
4+
#include "common/op.hpp"
5+
#include <optional>
6+
7+
namespace infinicore::op {
8+
9+
class PagedAttention {
10+
public:
11+
using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, std::optional<Tensor>, float);
12+
static void execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale);
13+
static common::OpDispatcher<schema> &dispatcher();
14+
};
15+
16+
Tensor paged_attention(Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale);
17+
void paged_attention_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor cache_lens, std::optional<Tensor> alibi_slopes, float scale);
18+
} // namespace infinicore::op
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#pragma once
2+
3+
#include "../device.hpp"
4+
#include "common/op.hpp"
5+
#include <optional>
6+
7+
namespace infinicore::op {
8+
9+
class PagedAttentionPrefill {
10+
public:
11+
/**
12+
* @brief PagedAttentionPrefill operator signature
13+
*
* Argument order:
14+
* 1. out: Output tensor (Packed format)
15+
* 2. q: Current Query tensor (Packed format)
16+
* 3. k_cache: Physical Key cache (Paged format)
17+
* 4. v_cache: Physical Value cache (Paged format)
18+
* 5. block_tables: Mapping table from logical blocks to physical blocks
19+
* 6. total_kv_lens: lengths of Complete Key/Value for each request
20+
* 7. cu_seqlens_q: Cumulative sequence lengths of Query (prefix sum for variable-length batch)
21+
* 8. alibi_slopes: ALiBi bias slopes (optional)
22+
* 9. scale: Scaling factor (typically 1/sqrt(head_size))
23+
*/
24+
using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, std::optional<Tensor>, float);
25+
26+
static void execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
27+
Tensor block_tables, Tensor total_kv_lens, Tensor cum_seqlens_q,
28+
std::optional<Tensor> alibi_slopes, float scale);
29+
30+
static common::OpDispatcher<schema> &dispatcher();
31+
};
32+
33+
Tensor paged_attention_prefill(Tensor q,
34+
Tensor k_cache,
35+
Tensor v_cache,
36+
Tensor block_tables,
37+
Tensor total_kv_lens,
38+
Tensor cum_seqlens_q,
39+
std::optional<Tensor> alibi_slopes,
40+
float scale);
41+
42+
void paged_attention_prefill_(Tensor out,
43+
Tensor q,
44+
Tensor k_cache,
45+
Tensor v_cache,
46+
Tensor block_tables,
47+
Tensor total_kv_lens,
48+
Tensor cum_seqlens_q,
49+
std::optional<Tensor> alibi_slopes,
50+
float scale);
51+
52+
} // namespace infinicore::op
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#pragma once
2+
3+
#include "../device.hpp"
4+
#include "common/op.hpp"
5+
6+
namespace infinicore::op {
7+
8+
class PagedCaching {
9+
public:
10+
using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor);
11+
static void execute(Tensor k_cache, Tensor v_cache, Tensor k, Tensor v, Tensor slot_mapping);
12+
static common::OpDispatcher<schema> &dispatcher();
13+
};
14+
15+
void paged_caching_(Tensor k_cache, Tensor v_cache, Tensor k, Tensor v, Tensor slot_mapping);
16+
17+
} // namespace infinicore::op

0 commit comments

Comments
 (0)