Skip to content

Commit 52f0dcf

Browse files
authored
Merge pull request #1019 from InfiniTensor/issue/1008
Issue/1008
2 parents d0f405c + 68026bd commit 52f0dcf

292 files changed

Lines changed: 453 additions & 312 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

scripts/python_test.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,9 @@ def run_tests(args):
3939
"topkrouter.py",
4040
"topksoftmax.py",
4141
"zeros.py",
42+
# "paged_attention.py",
43+
# "paged_caching.py",
44+
# "paged_attention_prefill.py"
4245
]:
4346
result = subprocess.run(
4447
f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True

src/infiniop/ops/layer_norm/nvidia/layer_norm_nvidia.cu

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -255,6 +255,8 @@ infiniStatus_t Descriptor::calculate(
255255
CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512)
256256
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
257257
CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096)
258+
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
259+
CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_2048)
258260
} else {
259261
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
260262
}

src/infiniop/ops/layer_norm/operator.cc

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -174,6 +174,9 @@ infiniopDestroyLayerNormDescriptor(infiniopLayerNormDescriptor_t desc) {
174174
#ifdef ENABLE_METAX_API
175175
DELETE(INFINI_DEVICE_METAX, metax);
176176
#endif
177+
#ifdef ENABLE_ILUVATAR_API
178+
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
179+
#endif
177180

178181
default:
179182
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;

src/infiniop/ops/logsoftmax/nvidia/logsoftmax_nvidia.cu

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -117,6 +117,11 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
117117
y, x, _info.x_dtype, _info.y_dtype, _info.batch_size, _info.probs_size, _info.ndim, _info.seq_len,
118118
_info.y_stride_b, _info.y_stride_p, _info.x_stride_b, _info.x_stride_p,
119119
_info.y_stride_0, _info.y_stride_1, _info.x_stride_0, _info.x_stride_1, stream));
120+
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
121+
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_2048>(
122+
y, x, _info.x_dtype, _info.y_dtype, _info.batch_size, _info.probs_size, _info.ndim, _info.seq_len,
123+
_info.y_stride_b, _info.y_stride_p, _info.x_stride_b, _info.x_stride_p,
124+
_info.y_stride_0, _info.y_stride_1, _info.x_stride_0, _info.x_stride_1, stream));
120125
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
121126
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
122127
y, x, _info.x_dtype, _info.y_dtype, _info.batch_size, _info.probs_size, _info.ndim, _info.seq_len,

src/infiniop/ops/logsoftmax/operator.cc

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,7 @@ __C infiniStatus_t infiniopCreateLogSoftmaxDescriptor(
4040
CREATE(INFINI_DEVICE_ALI, nvidia);
4141
#endif
4242
#ifdef ENABLE_ILUVATAR_API
43-
// CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
43+
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
4444
#endif
4545
#ifdef ENABLE_QY_API
4646
CREATE(INFINI_DEVICE_QY, nvidia);
@@ -73,7 +73,7 @@ __C infiniStatus_t infiniopGetLogSoftmaxWorkspaceSize(infiniopLogSoftmaxDescript
7373
GET(INFINI_DEVICE_ALI, nvidia);
7474
#endif
7575
#ifdef ENABLE_ILUVATAR_API
76-
// GET(INFINI_DEVICE_ILUVATAR, nvidia);
76+
GET(INFINI_DEVICE_ILUVATAR, nvidia);
7777
#endif
7878
#ifdef ENABLE_QY_API
7979
GET(INFINI_DEVICE_QY, nvidia);
@@ -111,7 +111,7 @@ __C infiniStatus_t infiniopLogSoftmax(
111111
CALCULATE(INFINI_DEVICE_ALI, nvidia);
112112
#endif
113113
#ifdef ENABLE_ILUVATAR_API
114-
// CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
114+
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
115115
#endif
116116
#ifdef ENABLE_QY_API
117117
CALCULATE(INFINI_DEVICE_QY, nvidia);
@@ -144,7 +144,7 @@ __C infiniStatus_t infiniopDestroyLogSoftmaxDescriptor(infiniopLogSoftmaxDescrip
144144
DESTROY(INFINI_DEVICE_ALI, nvidia);
145145
#endif
146146
#ifdef ENABLE_ILUVATAR_API
147-
// DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
147+
DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
148148
#endif
149149
#ifdef ENABLE_QY_API
150150
DESTROY(INFINI_DEVICE_QY, nvidia);

src/infiniop/ops/lp_norm/nvidia/lp_norm_nvidia.cu

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -155,6 +155,8 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
155155
CALCULATE_LP_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024)
156156
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
157157
CALCULATE_LP_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512)
158+
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
159+
CALCULATE_LP_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_2048)
158160
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
159161
CALCULATE_LP_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096)
160162
} else {

src/infiniop/ops/paged_attention/cuda/kernel_v2.cuh

Lines changed: 49 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,17 +16,66 @@ struct OnlineSoftmaxState {
1616
}
1717
};
1818
__device__ __forceinline__ float warpReduceSum(float x) {
19+
#if defined(ENABLE_ILUVATAR_API)
20+
// Iluvatar may use warp size 64; __shfl_sync(0xffffffff) only covers 32 threads.
21+
// Use shared-memory tree reduce for portability across warp sizes.
22+
constexpr int kMaxWarps = 16;
23+
__shared__ float _reduce_buf[kMaxWarps * 32];
24+
const int lane = threadIdx.x & 31;
25+
const int warp_id = threadIdx.x / 32;
26+
_reduce_buf[threadIdx.x] = x;
27+
__syncthreads();
28+
for (int offset = 16; offset > 0; offset >>= 1) {
29+
if (lane < offset) {
30+
_reduce_buf[warp_id * 32 + lane] += _reduce_buf[warp_id * 32 + lane + offset];
31+
}
32+
__syncthreads();
33+
}
34+
return _reduce_buf[warp_id * 32];
35+
#else
1936
for (int offset = 16; offset > 0; offset >>= 1) {
2037
x += __shfl_down_sync(0xffffffff, x, offset);
2138
}
2239
return x;
40+
#endif
41+
}
42+
43+
__device__ __forceinline__ float warpBroadcast(float x, int src_lane) {
44+
#if defined(ENABLE_ILUVATAR_API)
45+
__shared__ float _bcast_buf[16];
46+
const int warp_id = threadIdx.x / 32;
47+
if ((threadIdx.x & 31) == src_lane) {
48+
_bcast_buf[warp_id] = x;
49+
}
50+
__syncthreads();
51+
return _bcast_buf[warp_id];
52+
#else
53+
return __shfl_sync(0xffffffff, x, src_lane);
54+
#endif
2355
}
2456

2557
__device__ __forceinline__ float warpReduceMax(float x) {
58+
#if defined(ENABLE_ILUVATAR_API)
59+
__shared__ float _reduce_buf[16 * 32];
60+
const int lane = threadIdx.x & 31;
61+
const int warp_id = threadIdx.x / 32;
62+
_reduce_buf[threadIdx.x] = x;
63+
__syncthreads();
64+
for (int offset = 16; offset > 0; offset >>= 1) {
65+
if (lane < offset) {
66+
float other = _reduce_buf[warp_id * 32 + lane + offset];
67+
float cur = _reduce_buf[warp_id * 32 + lane];
68+
_reduce_buf[warp_id * 32 + lane] = fmaxf(cur, other);
69+
}
70+
__syncthreads();
71+
}
72+
return _reduce_buf[warp_id * 32];
73+
#else
2674
for (int offset = 16; offset > 0; offset >>= 1) {
2775
x = fmaxf(x, __shfl_down_sync(0xffffffff, x, offset));
2876
}
2977
return x;
78+
#endif
3079
}
3180

3281
__device__ __forceinline__ unsigned int cvtaToShared(const void *ptr) {

src/infiniop/ops/paged_attention_prefill/cuda/kernel_v2.cuh

Lines changed: 18 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
#ifndef __PAGED_ATTENTION_PREFILL_KERNEL_V2_CUH__
22
#define __PAGED_ATTENTION_PREFILL_KERNEL_V2_CUH__
33

4-
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API)
4+
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
55
#include <cuda_bf16.h>
66
#include <cuda_fp16.h>
77
#include <cuda_runtime.h>
@@ -194,8 +194,8 @@ __device__ void PagedAttentionPrefillWarpKernel(
194194
l = l * alpha + beta;
195195
m = m_new;
196196
}
197-
alpha = __shfl_sync(0xffffffff, alpha, 0);
198-
beta = __shfl_sync(0xffffffff, beta, 0);
197+
alpha = op::paged_attention::cuda::warpBroadcast(alpha, 0);
198+
beta = op::paged_attention::cuda::warpBroadcast(beta, 0);
199199

200200
#if defined(__CUDA_ARCH__)
201201
if constexpr (std::is_same_v<Tdata, half>) {
@@ -233,7 +233,7 @@ __device__ void PagedAttentionPrefillWarpKernel(
233233
if (lane == 0) {
234234
inv_l = 1.0f / (l + 1e-6f);
235235
}
236-
inv_l = __shfl_sync(0xffffffff, inv_l, 0);
236+
inv_l = op::paged_attention::cuda::warpBroadcast(inv_l, 0);
237237

238238
#pragma unroll
239239
for (int i = 0; i < DIMS_PER_THREAD; ++i) {
@@ -411,8 +411,8 @@ __global__ void PagedAttentionPrefillWarpGlobalKernel(
411411
l = l * alpha + beta;
412412
m = m_new;
413413
}
414-
alpha = __shfl_sync(0xffffffff, alpha, 0);
415-
beta = __shfl_sync(0xffffffff, beta, 0);
414+
alpha = op::paged_attention::cuda::warpBroadcast(alpha, 0);
415+
beta = op::paged_attention::cuda::warpBroadcast(beta, 0);
416416

417417
#if defined(__CUDA_ARCH__)
418418
if constexpr (std::is_same_v<Tdata, half>) {
@@ -450,7 +450,11 @@ __global__ void PagedAttentionPrefillWarpGlobalKernel(
450450
if (lane == 0) {
451451
inv_l = 1.0f / (l + 1e-6f);
452452
}
453+
#ifdef ENABLE_ILUVATAR_API
454+
inv_l = op::paged_attention::cuda::warpBroadcast(inv_l, 0);
455+
#else
453456
inv_l = __shfl_sync(0xffffffff, inv_l, 0);
457+
#endif
454458

455459
#pragma unroll
456460
for (int i = 0; i < DIMS_PER_THREAD; ++i) {
@@ -785,8 +789,8 @@ __device__ void PagedAttentionPrefillWarpCtaKernel(
785789
l = l * alpha + beta;
786790
m = m_new;
787791
}
788-
alpha = __shfl_sync(0xffffffff, alpha, 0);
789-
beta = __shfl_sync(0xffffffff, beta, 0);
792+
alpha = op::paged_attention::cuda::warpBroadcast(alpha, 0);
793+
beta = op::paged_attention::cuda::warpBroadcast(beta, 0);
790794

791795
#if defined(__CUDA_ARCH__)
792796
if constexpr (std::is_same_v<Tdata, half>) {
@@ -826,7 +830,7 @@ __device__ void PagedAttentionPrefillWarpCtaKernel(
826830
if (lane == 0) {
827831
inv_l = 1.0f / (l + 1e-6f);
828832
}
829-
inv_l = __shfl_sync(0xffffffff, inv_l, 0);
833+
inv_l = op::paged_attention::cuda::warpBroadcast(inv_l, 0);
830834

831835
#pragma unroll
832836
for (int i = 0; i < DIMS_PER_THREAD; ++i) {
@@ -1270,7 +1274,7 @@ __device__ void PagedAttentionPrefillWarpCtaKernelPipelined(
12701274
if (lane == 0) {
12711275
inv_l = 1.0f / (l + 1e-6f);
12721276
}
1273-
inv_l = __shfl_sync(0xffffffff, inv_l, 0);
1277+
inv_l = op::paged_attention::cuda::warpBroadcast(inv_l, 0);
12741278

12751279
#pragma unroll
12761280
for (int i = 0; i < DIMS_PER_THREAD; ++i) {
@@ -1961,8 +1965,8 @@ __device__ void PagedAttentionPrefillWarpCtaKernelKOnly(
19611965
l = l * alpha + beta;
19621966
m = m_new;
19631967
}
1964-
alpha = __shfl_sync(0xffffffff, alpha, 0);
1965-
beta = __shfl_sync(0xffffffff, beta, 0);
1968+
alpha = op::paged_attention::cuda::warpBroadcast(alpha, 0);
1969+
beta = op::paged_attention::cuda::warpBroadcast(beta, 0);
19661970

19671971
#if defined(__CUDA_ARCH__)
19681972
if constexpr (std::is_same_v<Tdata, half>) {
@@ -2002,7 +2006,7 @@ __device__ void PagedAttentionPrefillWarpCtaKernelKOnly(
20022006
if (lane == 0) {
20032007
inv_l = 1.0f / (l + 1e-6f);
20042008
}
2005-
inv_l = __shfl_sync(0xffffffff, inv_l, 0);
2009+
inv_l = op::paged_attention::cuda::warpBroadcast(inv_l, 0);
20062010

20072011
#pragma unroll
20082012
for (int i = 0; i < DIMS_PER_THREAD; ++i) {
@@ -2131,7 +2135,7 @@ __device__ __forceinline__ void PagedAttentionPrefillMmaScoreWriteRow(
21312135
if (lane == 0) {
21322136
inv_l = 1.0f / (l + 1e-6f);
21332137
}
2134-
inv_l = __shfl_sync(0xffffffff, inv_l, 0);
2138+
inv_l = op::paged_attention::cuda::warpBroadcast(inv_l, 0);
21352139

21362140
const int64_t q_token = q_start + static_cast<int64_t>(q_token_local);
21372141
half *out_ptr = out_ + q_token * o_stride + static_cast<int64_t>(head_idx) * o_head_stride;

src/infiniop/ops/paged_attention_prefill/nvidia/paged_attention_prefill_nvidia.cu

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,11 @@ constexpr size_t ceilDiv(size_t a, size_t b) {
2121
}
2222

2323
inline const char *default_prefill_kernel(const PagedAttentionPrefillInfo &info) {
24+
// Iluvatar: use warp (stable). Users can override via INFINIOP_FLASH_PREFILL_KERNEL.
25+
#ifdef ENABLE_ILUVATAR_API
26+
(void)info;
27+
return "warp";
28+
#endif
2429
// Heuristic auto-dispatch (v0.4):
2530
// - Prefer the pipelined + tile-wise softmax kernel on FA2-compatible block_size=256.
2631
// - Keep a conservative fallback for other shapes / older GPUs (cp.async is a no-op below SM80).

src/infiniop/ops/rearrange/nvidia/rearrange_kernel.cuh

Lines changed: 20 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,8 @@
88
#define ARRAY_TYPE_SIZE size_t
99

1010
// 与 DEFINE_KERNELS_BY_CONSTRAINT 耦合,需要同时修改
11-
#define MAX_BLOCK_ARRAY_SIZE 5
12-
#define MAX_GRID_ARRAY_SIZE 5
11+
#define MAX_BLOCK_ARRAY_SIZE 6
12+
#define MAX_GRID_ARRAY_SIZE 6
1313

1414
template <int ArrSize, typename ArrayType>
1515
struct ArrayStruct {
@@ -185,32 +185,43 @@ struct Constraint {
185185
DEFINE_REARRANGE_KERNEL(double4, constraint_num, block_array_size, grid_array_size)
186186

187187
// 与 MAX_BLOCK_ARRAY_SIZE 和 MAX_GRID_ARRAY_SIZE 耦合,需要同时修改
188-
// 为1-5和1-5的所有组合生成内核
188+
// 为1-6和1-6的所有组合生成内核
189189
DEFINE_KERNELS_BY_CONSTRAINT(1, 1)
190190
DEFINE_KERNELS_BY_CONSTRAINT(1, 2)
191191
DEFINE_KERNELS_BY_CONSTRAINT(1, 3)
192192
DEFINE_KERNELS_BY_CONSTRAINT(1, 4)
193193
DEFINE_KERNELS_BY_CONSTRAINT(1, 5)
194+
DEFINE_KERNELS_BY_CONSTRAINT(1, 6)
194195
DEFINE_KERNELS_BY_CONSTRAINT(2, 1)
195196
DEFINE_KERNELS_BY_CONSTRAINT(2, 2)
196197
DEFINE_KERNELS_BY_CONSTRAINT(2, 3)
197198
DEFINE_KERNELS_BY_CONSTRAINT(2, 4)
198199
DEFINE_KERNELS_BY_CONSTRAINT(2, 5)
200+
DEFINE_KERNELS_BY_CONSTRAINT(2, 6)
199201
DEFINE_KERNELS_BY_CONSTRAINT(3, 1)
200202
DEFINE_KERNELS_BY_CONSTRAINT(3, 2)
201203
DEFINE_KERNELS_BY_CONSTRAINT(3, 3)
202204
DEFINE_KERNELS_BY_CONSTRAINT(3, 4)
203205
DEFINE_KERNELS_BY_CONSTRAINT(3, 5)
206+
DEFINE_KERNELS_BY_CONSTRAINT(3, 6)
204207
DEFINE_KERNELS_BY_CONSTRAINT(4, 1)
205208
DEFINE_KERNELS_BY_CONSTRAINT(4, 2)
206209
DEFINE_KERNELS_BY_CONSTRAINT(4, 3)
207210
DEFINE_KERNELS_BY_CONSTRAINT(4, 4)
208211
DEFINE_KERNELS_BY_CONSTRAINT(4, 5)
212+
DEFINE_KERNELS_BY_CONSTRAINT(4, 6)
209213
DEFINE_KERNELS_BY_CONSTRAINT(5, 1)
210214
DEFINE_KERNELS_BY_CONSTRAINT(5, 2)
211215
DEFINE_KERNELS_BY_CONSTRAINT(5, 3)
212216
DEFINE_KERNELS_BY_CONSTRAINT(5, 4)
213217
DEFINE_KERNELS_BY_CONSTRAINT(5, 5)
218+
DEFINE_KERNELS_BY_CONSTRAINT(5, 6)
219+
DEFINE_KERNELS_BY_CONSTRAINT(6, 1)
220+
DEFINE_KERNELS_BY_CONSTRAINT(6, 2)
221+
DEFINE_KERNELS_BY_CONSTRAINT(6, 3)
222+
DEFINE_KERNELS_BY_CONSTRAINT(6, 4)
223+
DEFINE_KERNELS_BY_CONSTRAINT(6, 5)
224+
DEFINE_KERNELS_BY_CONSTRAINT(6, 6)
214225

215226
// 准备参数结构体
216227
struct RearrangeParams {
@@ -294,6 +305,9 @@ utils::Result<void *> getRearrangeKernel(const RearrangeParams &params) {
294305
case 5: \
295306
GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 5); \
296307
break; \
308+
case 6: \
309+
GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 6); \
310+
break; \
297311
}
298312

299313
#define GET_REARRANGE_KERNEL_BY_BLOCK_NUM \
@@ -313,6 +327,9 @@ utils::Result<void *> getRearrangeKernel(const RearrangeParams &params) {
313327
case 5: \
314328
GET_REARRANGE_KERNEL_BY_GRID_NUM(5); \
315329
break; \
330+
case 6: \
331+
GET_REARRANGE_KERNEL_BY_GRID_NUM(6); \
332+
break; \
316333
}
317334

318335
GET_REARRANGE_KERNEL_BY_BLOCK_NUM

0 commit comments

Comments (0)