Skip to content

Commit 120d746

Browse files
committed
Issue/887 - Add pow,div,mod,min,max operator with CPU and NVIDIA implementations.
1 parent 7c97894 commit 120d746

42 files changed

Lines changed: 2956 additions & 0 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

include/infiniop.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,14 @@
99
#include "infiniop/ops/clip.h"
1010
#include "infiniop/ops/conv.h"
1111
#include "infiniop/ops/dequantize_awq.h"
12+
#include "infiniop/ops/div.h"
1213
#include "infiniop/ops/gelu.h"
1314
#include "infiniop/ops/gemm.h"
1415
#include "infiniop/ops/layer_norm.h"
1516
#include "infiniop/ops/logsoftmax.h"
1617
#include "infiniop/ops/lp_norm.h"
18+
#include "infiniop/ops/max.h"
19+
#include "infiniop/ops/min.h"
1720
#include "infiniop/ops/mul.h"
1821
#include "infiniop/ops/ones.h"
1922
#include "infiniop/ops/paged_attention.h"

include/infiniop/ops/div.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#ifndef __INFINIOP_DIV_API_H__
2+
#define __INFINIOP_DIV_API_H__
3+
4+
#include "../operator_descriptor.h"
5+
6+
typedef struct InfiniopDescriptor *infiniopDivDescriptor_t;
7+
8+
__C __export infiniStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle,
9+
infiniopDivDescriptor_t *desc_ptr,
10+
infiniopTensorDescriptor_t c,
11+
infiniopTensorDescriptor_t a,
12+
infiniopTensorDescriptor_t b);
13+
14+
__C __export infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size);
15+
16+
__C __export infiniStatus_t infiniopDiv(infiniopDivDescriptor_t desc,
17+
void *workspace,
18+
size_t workspace_size,
19+
void *c,
20+
const void *a,
21+
const void *b,
22+
void *stream);
23+
24+
__C __export infiniStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc);
25+
26+
#endif

include/infiniop/ops/max.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#ifndef __INFINIOP_MAX_API_H__
2+
#define __INFINIOP_MAX_API_H__
3+
4+
#include "../operator_descriptor.h"
5+
6+
typedef struct InfiniopDescriptor *infiniopMaxDescriptor_t;
7+
8+
__C __export infiniStatus_t infiniopCreateMaxDescriptor(infiniopHandle_t handle,
9+
infiniopMaxDescriptor_t *desc_ptr,
10+
infiniopTensorDescriptor_t c,
11+
infiniopTensorDescriptor_t a,
12+
infiniopTensorDescriptor_t b);
13+
14+
__C __export infiniStatus_t infiniopGetMaxWorkspaceSize(infiniopMaxDescriptor_t desc, size_t *size);
15+
16+
__C __export infiniStatus_t infiniopMax(infiniopMaxDescriptor_t desc,
17+
void *workspace,
18+
size_t workspace_size,
19+
void *c,
20+
const void *a,
21+
const void *b,
22+
void *stream);
23+
24+
__C __export infiniStatus_t infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc);
25+
26+
#endif

include/infiniop/ops/min.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#ifndef __INFINIOP_MIN_API_H__
2+
#define __INFINIOP_MIN_API_H__
3+
4+
#include "../operator_descriptor.h"
5+
6+
typedef struct InfiniopDescriptor *infiniopMinDescriptor_t;
7+
8+
__C __export infiniStatus_t infiniopCreateMinDescriptor(infiniopHandle_t handle,
9+
infiniopMinDescriptor_t *desc_ptr,
10+
infiniopTensorDescriptor_t c,
11+
infiniopTensorDescriptor_t a,
12+
infiniopTensorDescriptor_t b);
13+
14+
__C __export infiniStatus_t infiniopGetMinWorkspaceSize(infiniopMinDescriptor_t desc, size_t *size);
15+
16+
__C __export infiniStatus_t infiniopMin(infiniopMinDescriptor_t desc,
17+
void *workspace,
18+
size_t workspace_size,
19+
void *c,
20+
const void *a,
21+
const void *b,
22+
void *stream);
23+
24+
__C __export infiniStatus_t infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc);
25+
26+
#endif

include/infiniop/ops/mod.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#ifndef __INFINIOP_MOD_API_H__
2+
#define __INFINIOP_MOD_API_H__
3+
4+
#include "../operator_descriptor.h"
5+
6+
typedef struct InfiniopDescriptor *infiniopModDescriptor_t;
7+
8+
__C __export infiniStatus_t infiniopCreateModDescriptor(infiniopHandle_t handle,
9+
infiniopModDescriptor_t *desc_ptr,
10+
infiniopTensorDescriptor_t c,
11+
infiniopTensorDescriptor_t a,
12+
infiniopTensorDescriptor_t b);
13+
14+
__C __export infiniStatus_t infiniopGetModWorkspaceSize(infiniopModDescriptor_t desc, size_t *size);
15+
16+
__C __export infiniStatus_t infiniopMod(infiniopModDescriptor_t desc,
17+
void *workspace,
18+
size_t workspace_size,
19+
void *c,
20+
const void *a,
21+
const void *b,
22+
void *stream);
23+
24+
__C __export infiniStatus_t infiniopDestroyModDescriptor(infiniopModDescriptor_t desc);
25+
26+
#endif

include/infiniop/ops/pow.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#ifndef __INFINIOP_POW_API_H__
2+
#define __INFINIOP_POW_API_H__
3+
4+
#include "../operator_descriptor.h"
5+
6+
typedef struct InfiniopDescriptor *infiniopPowDescriptor_t;
7+
8+
__C __export infiniStatus_t infiniopCreatePowDescriptor(infiniopHandle_t handle,
9+
infiniopPowDescriptor_t *desc_ptr,
10+
infiniopTensorDescriptor_t c,
11+
infiniopTensorDescriptor_t a,
12+
infiniopTensorDescriptor_t b);
13+
14+
__C __export infiniStatus_t infiniopGetPowWorkspaceSize(infiniopPowDescriptor_t desc, size_t *size);
15+
16+
__C __export infiniStatus_t infiniopPow(infiniopPowDescriptor_t desc,
17+
void *workspace,
18+
size_t workspace_size,
19+
void *c,
20+
const void *a,
21+
const void *b,
22+
void *stream);
23+
24+
__C __export infiniStatus_t infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc);
25+
26+
#endif
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#include "div_cpu.h"
2+
3+
namespace op::div::cpu {
4+
5+
Descriptor::~Descriptor() = default;
6+
7+
infiniStatus_t Descriptor::create(
8+
infiniopHandle_t handle_,
9+
Descriptor **desc_ptr,
10+
infiniopTensorDescriptor_t out_desc,
11+
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
12+
13+
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
14+
auto dtype = out_desc->dtype();
15+
16+
const auto &a_desc = input_desc_vec.at(0);
17+
const auto &b_desc = input_desc_vec.at(1);
18+
const auto &c_shape = out_desc->shape();
19+
const auto &a_shape = a_desc->shape();
20+
const auto &b_shape = b_desc->shape();
21+
22+
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
23+
24+
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
25+
26+
// create CPU elementwise descriptor
27+
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
28+
29+
return INFINI_STATUS_SUCCESS;
30+
}
31+
32+
infiniStatus_t Descriptor::calculate(
33+
void *workspace,
34+
size_t workspace_size,
35+
void *output,
36+
std::vector<const void *> inputs,
37+
void *stream) const {
38+
39+
switch (_dtype) {
40+
case INFINI_DTYPE_F16:
41+
return _device_info->calculate<DivOp, fp16_t>(_info, output, inputs, stream);
42+
case INFINI_DTYPE_F32:
43+
return _device_info->calculate<DivOp, float>(_info, output, inputs, stream);
44+
default:
45+
return INFINI_STATUS_BAD_TENSOR_DTYPE;
46+
}
47+
48+
return INFINI_STATUS_SUCCESS;
49+
}
50+
} // namespace op::div::cpu

src/infiniop/ops/div/cpu/div_cpu.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#ifndef __DIV_CPU_H__
2+
#define __DIV_CPU_H__
3+
4+
#include "../../../elementwise/cpu/elementwise_cpu.h"
5+
6+
ELEMENTWISE_DESCRIPTOR(div, cpu)
7+
8+
namespace op::div::cpu {
9+
typedef struct DivOp {
10+
public:
11+
static constexpr size_t num_inputs = 2;
12+
template <typename T>
13+
T operator()(const T &a, const T &b) const {
14+
return a / b;
15+
}
16+
} DivOp;
17+
} // namespace op::div::cpu
18+
19+
#endif // __DIV_CPU_H__
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#ifndef __DIV_CUDA_H__
2+
#define __DIV_CUDA_H__
3+
4+
namespace op::div::cuda {
5+
typedef struct DivOp {
6+
public:
7+
static constexpr size_t num_inputs = 2;
8+
template <typename T>
9+
__device__ __forceinline__ T operator()(const T &a, const T &b) const {
10+
if constexpr (std::is_same_v<T, half2>) {
11+
return __h2div(a, b);
12+
} else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
13+
return a / b;
14+
} else if constexpr (std::is_same_v<T, float>) {
15+
return __fdividef(a, b);
16+
} else {
17+
return a / b;
18+
}
19+
}
20+
} DivOp;
21+
} // namespace op::div::cuda
22+
23+
#endif // __DIV_CUDA_H__
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
2+
3+
#include "../cuda/kernel.cuh"
4+
#include "div_nvidia.cuh"
5+
6+
namespace op::div::nvidia {
7+
8+
Descriptor::~Descriptor() = default;
9+
10+
infiniStatus_t Descriptor::create(
11+
infiniopHandle_t handle_,
12+
Descriptor **desc_ptr,
13+
infiniopTensorDescriptor_t out_desc,
14+
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
15+
16+
auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
17+
auto dtype = out_desc->dtype();
18+
19+
const auto &a_desc = input_desc_vec.at(0);
20+
const auto &b_desc = input_desc_vec.at(1);
21+
const auto &c_shape = out_desc->shape();
22+
const auto &a_shape = a_desc->shape();
23+
const auto &b_shape = b_desc->shape();
24+
25+
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
26+
27+
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
28+
29+
// create CUDA elementwise descriptor
30+
CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
31+
32+
return INFINI_STATUS_SUCCESS;
33+
}
34+
35+
infiniStatus_t Descriptor::calculate(
36+
void *workspace,
37+
size_t workspace_size,
38+
void *output,
39+
std::vector<const void *> inputs,
40+
void *stream) const {
41+
42+
if (workspace_size < _workspace_size) {
43+
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
44+
}
45+
46+
switch (_dtype) {
47+
case INFINI_DTYPE_F16:
48+
return _device_info->calculate<256, cuda::DivOp, half>(_info, workspace, output, inputs, stream);
49+
case INFINI_DTYPE_F32:
50+
return _device_info->calculate<256, cuda::DivOp, float>(_info, workspace, output, inputs, stream);
51+
default:
52+
return INFINI_STATUS_BAD_TENSOR_DTYPE;
53+
}
54+
55+
return INFINI_STATUS_SUCCESS;
56+
}
57+
} // namespace op::div::nvidia

0 commit comments

Comments
 (0)