From 6431be37f3522c74a193c5617dcd6602b4528c97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B5=96=E6=B3=89laiquan?= <167113918+LaiQuan-conquer@users.noreply.github.com> Date: Tue, 7 Apr 2026 10:45:01 +0800 Subject: [PATCH 1/2] issue/1031 merge T1-1-41 --- include/infiniop.h | 5 + include/infiniop/ops/erf.h | 24 + include/infiniop/ops/erfc.h | 24 + include/infiniop/ops/erfinv.h | 24 + include/infiniop/ops/matrix_power.h | 25 + include/infiniop/ops/pixel_shuffle.h | 25 + src/infiniop/ops/erf/cpu/erf_cpu.cc | 52 ++ src/infiniop/ops/erf/cpu/erf_cpu.h | 20 + src/infiniop/ops/erf/cuda/kernel.cuh | 36 ++ src/infiniop/ops/erf/erf.h | 8 + src/infiniop/ops/erf/metax/erf_metax.h | 8 + src/infiniop/ops/erf/metax/erf_metax.maca | 60 ++ src/infiniop/ops/erf/moore/erf_moore.h | 8 + src/infiniop/ops/erf/moore/erf_moore.mu | 60 ++ src/infiniop/ops/erf/moore/erf_moore_kernel.h | 36 ++ src/infiniop/ops/erf/nvidia/erf_nvidia.cu | 58 ++ src/infiniop/ops/erf/nvidia/erf_nvidia.cuh | 8 + src/infiniop/ops/erf/operator.cc | 157 +++++ src/infiniop/ops/erfc/cpu/erfc_cpu.cc | 52 ++ src/infiniop/ops/erfc/cpu/erfc_cpu.h | 20 + src/infiniop/ops/erfc/cuda/kernel.cuh | 36 ++ src/infiniop/ops/erfc/erfc.h | 8 + src/infiniop/ops/erfc/metax/erfc_metax.h | 8 + src/infiniop/ops/erfc/metax/erfc_metax.maca | 60 ++ src/infiniop/ops/erfc/moore/erfc_moore.h | 8 + src/infiniop/ops/erfc/moore/erfc_moore.mu | 60 ++ .../ops/erfc/moore/erfc_moore_kernel.h | 36 ++ src/infiniop/ops/erfc/nvidia/erfc_nvidia.cu | 58 ++ src/infiniop/ops/erfc/nvidia/erfc_nvidia.cuh | 8 + src/infiniop/ops/erfc/operator.cc | 157 +++++ src/infiniop/ops/erfinv/cpu/erfinv_cpu.cc | 52 ++ src/infiniop/ops/erfinv/cpu/erfinv_cpu.h | 46 ++ src/infiniop/ops/erfinv/cuda/kernel.cuh | 113 ++++ src/infiniop/ops/erfinv/erfinv.h | 8 + src/infiniop/ops/erfinv/metax/erfinv_metax.h | 8 + .../ops/erfinv/metax/erfinv_metax.maca | 60 ++ src/infiniop/ops/erfinv/moore/erfinv_moore.h | 8 + src/infiniop/ops/erfinv/moore/erfinv_moore.mu | 60 ++ .../ops/erfinv/moore/erfinv_moore_kernel.h | 72 +++ .../ops/erfinv/nvidia/erfinv_nvidia.cu | 58 ++ .../ops/erfinv/nvidia/erfinv_nvidia.cuh | 8 + src/infiniop/ops/erfinv/operator.cc | 157 +++++ .../ops/matrix_power/cpu/matrix_power_cpu.cc | 201 +++++++ .../ops/matrix_power/cpu/matrix_power_cpu.h | 69 +++ .../matrix_power/metax/matrix_power_metax.h | 48 ++ .../metax/matrix_power_metax.maca | 107 ++++ .../matrix_power/moore/matrix_power_moore.h | 48 ++ .../matrix_power/moore/matrix_power_moore.mu | 106 ++++ .../nvidia/matrix_power_nvidia.cu | 548 ++++++++++++++++++ .../nvidia/matrix_power_nvidia.cuh | 72 +++ src/infiniop/ops/matrix_power/operator.cc | 159 +++++ .../pixel_shuffle/cpu/pixel_shuffle_cpu.cc | 137 +++++ .../ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h | 58 ++ .../ops/pixel_shuffle/cuda/kernel.cuh | 95 +++ .../pixel_shuffle/metax/pixel_shuffle_metax.h | 57 ++ .../metax/pixel_shuffle_metax.maca | 113 ++++ .../pixel_shuffle/moore/pixel_shuffle_moore.h | 57 ++ .../moore/pixel_shuffle_moore.mu | 113 ++++ .../nvidia/pixel_shuffle_nvidia.cu | 129 +++++ .../nvidia/pixel_shuffle_nvidia.cuh | 65 +++ src/infiniop/ops/pixel_shuffle/operator.cc | 159 +++++ 61 files changed, 4140 insertions(+) create mode 100644 include/infiniop/ops/erf.h create mode 100644 include/infiniop/ops/erfc.h create mode 100644 include/infiniop/ops/erfinv.h create mode 100644 include/infiniop/ops/matrix_power.h create mode 100644 include/infiniop/ops/pixel_shuffle.h create mode 100644 src/infiniop/ops/erf/cpu/erf_cpu.cc create mode 100644 src/infiniop/ops/erf/cpu/erf_cpu.h create mode 100644 src/infiniop/ops/erf/cuda/kernel.cuh create mode 100644 src/infiniop/ops/erf/erf.h create mode 100644 src/infiniop/ops/erf/metax/erf_metax.h create mode 100644 src/infiniop/ops/erf/metax/erf_metax.maca create mode 100644 src/infiniop/ops/erf/moore/erf_moore.h create mode 100644 src/infiniop/ops/erf/moore/erf_moore.mu create mode 100644 src/infiniop/ops/erf/moore/erf_moore_kernel.h create mode 100644 src/infiniop/ops/erf/nvidia/erf_nvidia.cu create mode 100644 src/infiniop/ops/erf/nvidia/erf_nvidia.cuh create mode 100644 src/infiniop/ops/erf/operator.cc create mode 100644 src/infiniop/ops/erfc/cpu/erfc_cpu.cc create mode 100644 src/infiniop/ops/erfc/cpu/erfc_cpu.h create mode 100644 src/infiniop/ops/erfc/cuda/kernel.cuh create mode 100644 src/infiniop/ops/erfc/erfc.h create mode 100644 src/infiniop/ops/erfc/metax/erfc_metax.h create mode 100644 src/infiniop/ops/erfc/metax/erfc_metax.maca create mode 100644 src/infiniop/ops/erfc/moore/erfc_moore.h create mode 100644 src/infiniop/ops/erfc/moore/erfc_moore.mu create mode 100644 src/infiniop/ops/erfc/moore/erfc_moore_kernel.h create mode 100644 src/infiniop/ops/erfc/nvidia/erfc_nvidia.cu create mode 100644 src/infiniop/ops/erfc/nvidia/erfc_nvidia.cuh create mode 100644 src/infiniop/ops/erfc/operator.cc create mode 100644 src/infiniop/ops/erfinv/cpu/erfinv_cpu.cc create mode 100644 src/infiniop/ops/erfinv/cpu/erfinv_cpu.h create mode 100644 src/infiniop/ops/erfinv/cuda/kernel.cuh create mode 100644 src/infiniop/ops/erfinv/erfinv.h create mode 100644 src/infiniop/ops/erfinv/metax/erfinv_metax.h create mode 100644 src/infiniop/ops/erfinv/metax/erfinv_metax.maca create mode 100644 src/infiniop/ops/erfinv/moore/erfinv_moore.h create mode 100644 src/infiniop/ops/erfinv/moore/erfinv_moore.mu create mode 100644 src/infiniop/ops/erfinv/moore/erfinv_moore_kernel.h create mode 100644 src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cu create mode 100644 src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cuh create mode 100644 src/infiniop/ops/erfinv/operator.cc create mode 100644 src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.cc create mode 100644 src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.h create mode 100644 src/infiniop/ops/matrix_power/metax/matrix_power_metax.h create mode 100644 src/infiniop/ops/matrix_power/metax/matrix_power_metax.maca create mode 100644 src/infiniop/ops/matrix_power/moore/matrix_power_moore.h create mode 100644 src/infiniop/ops/matrix_power/moore/matrix_power_moore.mu create mode 100644 src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cu create mode 100644 src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cuh create mode 100644 src/infiniop/ops/matrix_power/operator.cc create mode 100644 src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.cc create mode 100644 src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h create mode 100644 src/infiniop/ops/pixel_shuffle/cuda/kernel.cuh create mode 100644 src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.h create mode 100644 src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.maca create mode 100644 src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.h create mode 100644 src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.mu create mode 100644 src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cu create mode 100644 src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cuh create mode 100644 src/infiniop/ops/pixel_shuffle/operator.cc diff --git a/include/infiniop.h b/include/infiniop.h index fe9537876..104476037 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -30,6 +30,9 @@ #include "infiniop/ops/dequantize_gptq.h" #include "infiniop/ops/embedding.h" #include "infiniop/ops/equal.h" +#include "infiniop/ops/erf.h" +#include "infiniop/ops/erfc.h" +#include "infiniop/ops/erfinv.h" #include "infiniop/ops/flash_attention.h" #include "infiniop/ops/flipud.h" #include "infiniop/ops/float_power.h" @@ -58,12 +61,14 @@ #include "infiniop/ops/logcumsumexp.h" #include "infiniop/ops/lp_norm.h" #include "infiniop/ops/masked_select.h" +#include "infiniop/ops/matrix_power.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/multi_margin_loss.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/paged_attention.h" #include "infiniop/ops/paged_attention_prefill.h" #include "infiniop/ops/paged_caching.h" +#include "infiniop/ops/pixel_shuffle.h" #include "infiniop/ops/quant/per_channel_quant_int8.h" #include "infiniop/ops/quant/per_tensor_quant_int8.h" #include "infiniop/ops/random_sample.h" diff --git a/include/infiniop/ops/erf.h b/include/infiniop/ops/erf.h new file mode 100644 index 000000000..8cbb8fb74 --- /dev/null +++ b/include/infiniop/ops/erf.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ERF_API_H__ +#define __INFINIOP_ERF_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopErfDescriptor_t; + +__C __export infiniStatus_t infiniopCreateErfDescriptor(infiniopHandle_t handle, + infiniopErfDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopErf(infiniopErfDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/erfc.h b/include/infiniop/ops/erfc.h new file mode 100644 index 000000000..6454573bc --- /dev/null +++ b/include/infiniop/ops/erfc.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ERFC_API_H__ +#define __INFINIOP_ERFC_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopErfcDescriptor_t; + +__C __export infiniStatus_t infiniopCreateErfcDescriptor(infiniopHandle_t handle, + infiniopErfcDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetErfcWorkspaceSize(infiniopErfcDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopErfc(infiniopErfcDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyErfcDescriptor(infiniopErfcDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/erfinv.h b/include/infiniop/ops/erfinv.h new file mode 100644 index 000000000..79bc09f22 --- /dev/null +++ b/include/infiniop/ops/erfinv.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ERFINV_API_H__ +#define __INFINIOP_ERFINV_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopErfinvDescriptor_t; + +__C __export infiniStatus_t infiniopCreateErfinvDescriptor(infiniopHandle_t handle, + infiniopErfinvDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetErfinvWorkspaceSize(infiniopErfinvDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopErfinv(infiniopErfinvDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyErfinvDescriptor(infiniopErfinvDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/matrix_power.h b/include/infiniop/ops/matrix_power.h new file mode 100644 index 000000000..acd7c0c7e --- /dev/null +++ b/include/infiniop/ops/matrix_power.h @@ -0,0 +1,25 @@ +#ifndef __INFINIOP_MATRIX_POWER_API_H__ +#define __INFINIOP_MATRIX_POWER_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopMatrixPowerDescriptor_t; + +__C __export infiniStatus_t infiniopCreateMatrixPowerDescriptor(infiniopHandle_t handle, + infiniopMatrixPowerDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int n); + +__C __export infiniStatus_t infiniopGetMatrixPowerWorkspaceSize(infiniopMatrixPowerDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopMatrixPower(infiniopMatrixPowerDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyMatrixPowerDescriptor(infiniopMatrixPowerDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/pixel_shuffle.h b/include/infiniop/ops/pixel_shuffle.h new file mode 100644 index 000000000..941a91cfc --- /dev/null +++ b/include/infiniop/ops/pixel_shuffle.h @@ -0,0 +1,25 @@ +#ifndef __INFINIOP_PIXEL_SHUFFLE_API_H__ +#define __INFINIOP_PIXEL_SHUFFLE_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopPixelShuffleDescriptor_t; + +__C __export infiniStatus_t infiniopCreatePixelShuffleDescriptor(infiniopHandle_t handle, + infiniopPixelShuffleDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int upscale_factor); + +__C __export infiniStatus_t infiniopGetPixelShuffleWorkspaceSize(infiniopPixelShuffleDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopPixelShuffle(infiniopPixelShuffleDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyPixelShuffleDescriptor(infiniopPixelShuffleDescriptor_t desc); + +#endif diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.cc b/src/infiniop/ops/erf/cpu/erf_cpu.cc new file mode 100644 index 000000000..7d127bfae --- /dev/null +++ b/src/infiniop/ops/erf/cpu/erf_cpu.cc @@ -0,0 +1,52 @@ +#include "erf_cpu.h" + +namespace op::erf::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erf::cpu diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.h b/src/infiniop/ops/erf/cpu/erf_cpu.h new file mode 100644 index 000000000..74ad19e57 --- /dev/null +++ b/src/infiniop/ops/erf/cpu/erf_cpu.h @@ -0,0 +1,20 @@ +#ifndef __ERF_CPU_H__ +#define __ERF_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(erf, cpu) + +namespace op::erf::cpu { +typedef struct ErfOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + return std::erf(x); + } +} ErfOp; +} // namespace op::erf::cpu + +#endif // __ERF_CPU_H__ diff --git a/src/infiniop/ops/erf/cuda/kernel.cuh b/src/infiniop/ops/erf/cuda/kernel.cuh new file mode 100644 index 000000000..9bd6cff21 --- /dev/null +++ b/src/infiniop/ops/erf/cuda/kernel.cuh @@ -0,0 +1,36 @@ +#pragma once +#include +#include +#include +#include +#include + +namespace op::cuda { + +struct ErfOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(T x) const { + if constexpr (std::is_same_v) { + return erff(x); + } else if constexpr (std::is_same_v) { + return erf(x); + } else { + // For F16/BF16: promote to float, compute, then cast back + float xf; + if constexpr (std::is_same_v) { + xf = __half2float(x); + return __float2half_rn(erff(xf)); + } else if constexpr (std::is_same_v) { + xf = __bfloat162float(x); + return __float2bfloat16_rn(erff(xf)); + } else { + xf = static_cast(x); + return static_cast(erff(xf)); + } + } + } +}; + +} // namespace op::cuda diff --git a/src/infiniop/ops/erf/erf.h b/src/infiniop/ops/erf/erf.h new file mode 100644 index 000000000..7c967dea2 --- /dev/null +++ b/src/infiniop/ops/erf/erf.h @@ -0,0 +1,8 @@ +#ifndef __ERF_H__ +#define __ERF_H__ + +#include "../../elementwise/elementwise.h" + +#define DESCRIPTOR(NAMESPACE) ELEMENTWISE_DESCRIPTOR(erf, NAMESPACE) + +#endif // __ERF_H__ diff --git a/src/infiniop/ops/erf/metax/erf_metax.h b/src/infiniop/ops/erf/metax/erf_metax.h new file mode 100644 index 000000000..5dfe23bbe --- /dev/null +++ b/src/infiniop/ops/erf/metax/erf_metax.h @@ -0,0 +1,8 @@ +#ifndef __ERF_METAX_API_H__ +#define __ERF_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(erf, metax) + +#endif // __ERF_METAX_API_H__ diff --git a/src/infiniop/ops/erf/metax/erf_metax.maca b/src/infiniop/ops/erf/metax/erf_metax.maca new file mode 100644 index 000000000..fcf956af2 --- /dev/null +++ b/src/infiniop/ops/erf/metax/erf_metax.maca @@ -0,0 +1,60 @@ +#include "erf_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::erf::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ErfOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erf::metax diff --git a/src/infiniop/ops/erf/moore/erf_moore.h b/src/infiniop/ops/erf/moore/erf_moore.h new file mode 100644 index 000000000..620055688 --- /dev/null +++ b/src/infiniop/ops/erf/moore/erf_moore.h @@ -0,0 +1,8 @@ +#ifndef __ERF_MOORE_API_H__ +#define __ERF_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(erf, moore) + +#endif // __ERF_MOORE_API_H__ diff --git a/src/infiniop/ops/erf/moore/erf_moore.mu b/src/infiniop/ops/erf/moore/erf_moore.mu new file mode 100644 index 000000000..1f717fa51 --- /dev/null +++ b/src/infiniop/ops/erf/moore/erf_moore.mu @@ -0,0 +1,60 @@ +#include "erf_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "erf_moore_kernel.h" + +namespace op::erf::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, moore::ErfOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::ErfOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::ErfOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, moore::ErfOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erf::moore diff --git a/src/infiniop/ops/erf/moore/erf_moore_kernel.h b/src/infiniop/ops/erf/moore/erf_moore_kernel.h new file mode 100644 index 000000000..8ddc9d5f1 --- /dev/null +++ b/src/infiniop/ops/erf/moore/erf_moore_kernel.h @@ -0,0 +1,36 @@ +#ifndef __ERF_MOORE_KERNEL_H__ +#define __ERF_MOORE_KERNEL_H__ + +#include +#include +#include +#include + +namespace op::erf::moore { + +typedef struct ErfOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + float x0 = __low2float(x); + float x1 = __high2float(x); + return __floats2half2_rn(erff(x0), erff(x1)); + } else if constexpr (std::is_same_v) { + float xf = __half2float(x); + return __float2half(erff(xf)); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(x); + return __float2bfloat16_rn(erff(xf)); + } else if constexpr (std::is_same_v) { + return erff(x); + } else { // double + return erf(x); + } + } +} ErfOp; + +} // namespace op::erf::moore + +#endif // __ERF_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/erf/nvidia/erf_nvidia.cu b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu new file mode 100644 index 000000000..03e14bb57 --- /dev/null +++ b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu @@ -0,0 +1,58 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "erf_nvidia.cuh" + +namespace op::erf::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ErfOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::erf::nvidia diff --git a/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh b/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh new file mode 100644 index 000000000..d20658027 --- /dev/null +++ b/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ERF_NVIDIA_H__ +#define __ERF_NVIDIA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(erf, nvidia) + +#endif // __ERF_NVIDIA_H__ diff --git a/src/infiniop/ops/erf/operator.cc b/src/infiniop/ops/erf/operator.cc new file mode 100644 index 000000000..f9b61e981 --- /dev/null +++ b/src/infiniop/ops/erf/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/erf.h" + +#ifdef ENABLE_CPU_API +#include "cpu/erf_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/erf_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/erf_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/erf_moore.h" +#endif + +__C __export infiniStatus_t infiniopCreateErfDescriptor( + infiniopHandle_t handle, + infiniopErfDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::erf::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C __export infiniStatus_t infiniopErf( + infiniopErfDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C __export infiniStatus_t +infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/erfc/cpu/erfc_cpu.cc b/src/infiniop/ops/erfc/cpu/erfc_cpu.cc new file mode 100644 index 000000000..35b82c678 --- /dev/null +++ b/src/infiniop/ops/erfc/cpu/erfc_cpu.cc @@ -0,0 +1,52 @@ +#include "erfc_cpu.h" + +namespace op::erfc::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erfc::cpu diff --git a/src/infiniop/ops/erfc/cpu/erfc_cpu.h b/src/infiniop/ops/erfc/cpu/erfc_cpu.h new file mode 100644 index 000000000..dd6d69496 --- /dev/null +++ b/src/infiniop/ops/erfc/cpu/erfc_cpu.h @@ -0,0 +1,20 @@ +#ifndef __ERFC_CPU_H__ +#define __ERFC_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(erfc, cpu) + +namespace op::erfc::cpu { +typedef struct ErfcOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + return std::erfc(x); + } +} ErfcOp; +} // namespace op::erfc::cpu + +#endif // __ERFC_CPU_H__ diff --git a/src/infiniop/ops/erfc/cuda/kernel.cuh b/src/infiniop/ops/erfc/cuda/kernel.cuh new file mode 100644 index 000000000..aae8efcee --- /dev/null +++ b/src/infiniop/ops/erfc/cuda/kernel.cuh @@ -0,0 +1,36 @@ +#pragma once +#include +#include +#include +#include +#include + +namespace op::cuda { + +struct ErfcOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(T x) const { + if constexpr (std::is_same_v) { + return erfcf(x); + } else if constexpr (std::is_same_v) { + return erfc(x); + } else { + // For F16/BF16: promote to float, compute, then cast back + float xf; + if constexpr (std::is_same_v) { + xf = __half2float(x); + return __float2half_rn(erfcf(xf)); + } else if constexpr (std::is_same_v) { + xf = __bfloat162float(x); + return __float2bfloat16_rn(erfcf(xf)); + } else { + xf = static_cast(x); + return static_cast(erfcf(xf)); + } + } + } +}; + +} // namespace op::cuda diff --git a/src/infiniop/ops/erfc/erfc.h b/src/infiniop/ops/erfc/erfc.h new file mode 100644 index 000000000..9ee12fd43 --- /dev/null +++ b/src/infiniop/ops/erfc/erfc.h @@ -0,0 +1,8 @@ +#ifndef __ERFC_H__ +#define __ERFC_H__ + +#include "../../elementwise/elementwise.h" + +#define DESCRIPTOR(NAMESPACE) ELEMENTWISE_DESCRIPTOR(erfc, NAMESPACE) + +#endif // __ERFC_H__ diff --git a/src/infiniop/ops/erfc/metax/erfc_metax.h b/src/infiniop/ops/erfc/metax/erfc_metax.h new file mode 100644 index 000000000..438f00095 --- /dev/null +++ b/src/infiniop/ops/erfc/metax/erfc_metax.h @@ -0,0 +1,8 @@ +#ifndef __ERFC_METAX_API_H__ +#define __ERFC_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(erfc, metax) + +#endif // __ERFC_METAX_API_H__ diff --git a/src/infiniop/ops/erfc/metax/erfc_metax.maca b/src/infiniop/ops/erfc/metax/erfc_metax.maca new file mode 100644 index 000000000..925e52ccf --- /dev/null +++ b/src/infiniop/ops/erfc/metax/erfc_metax.maca @@ -0,0 +1,60 @@ +#include "erfc_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::erfc::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfcOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfcOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ErfcOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfcOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erfc::metax diff --git a/src/infiniop/ops/erfc/moore/erfc_moore.h b/src/infiniop/ops/erfc/moore/erfc_moore.h new file mode 100644 index 000000000..d032e4305 --- /dev/null +++ b/src/infiniop/ops/erfc/moore/erfc_moore.h @@ -0,0 +1,8 @@ +#ifndef __ERFC_MOORE_API_H__ +#define __ERFC_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(erfc, moore) + +#endif // __ERFC_MOORE_API_H__ diff --git a/src/infiniop/ops/erfc/moore/erfc_moore.mu b/src/infiniop/ops/erfc/moore/erfc_moore.mu new file mode 100644 index 000000000..d1eaec1bf --- /dev/null +++ b/src/infiniop/ops/erfc/moore/erfc_moore.mu @@ -0,0 +1,60 @@ +#include "erfc_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "erfc_moore_kernel.h" + +namespace op::erfc::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, moore::ErfcOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::ErfcOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::ErfcOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, moore::ErfcOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erfc::moore diff --git a/src/infiniop/ops/erfc/moore/erfc_moore_kernel.h b/src/infiniop/ops/erfc/moore/erfc_moore_kernel.h new file mode 100644 index 000000000..cd5225c3b --- /dev/null +++ b/src/infiniop/ops/erfc/moore/erfc_moore_kernel.h @@ -0,0 +1,36 @@ +#ifndef __ERFC_MOORE_KERNEL_H__ +#define __ERFC_MOORE_KERNEL_H__ + +#include +#include +#include +#include + +namespace op::erfc::moore { + +typedef struct ErfcOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + float x0 = __low2float(x); + float x1 = __high2float(x); + return __floats2half2_rn(erfcf(x0), erfcf(x1)); + } else if constexpr (std::is_same_v) { + float xf = __half2float(x); + return __float2half(erfcf(xf)); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(x); + return __float2bfloat16_rn(erfcf(xf)); + } else if constexpr (std::is_same_v) { + return erfcf(x); + } else { // double + return erfc(x); + } + } +} ErfcOp; + +} // namespace op::erfc::moore + +#endif // __ERFC_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cu b/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cu new file mode 100644 index 000000000..483f11a18 --- /dev/null +++ b/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cu @@ -0,0 +1,58 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "erfc_nvidia.cuh" + +namespace op::erfc::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfcOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfcOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ErfcOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfcOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::erfc::nvidia diff --git a/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cuh b/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cuh new file mode 100644 index 000000000..4d5321c9d --- /dev/null +++ b/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ERFC_NVIDIA_H__ +#define __ERFC_NVIDIA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(erfc, nvidia) + +#endif // __ERFC_NVIDIA_H__ diff --git a/src/infiniop/ops/erfc/operator.cc b/src/infiniop/ops/erfc/operator.cc new file mode 100644 index 000000000..fa102c90c --- /dev/null +++ b/src/infiniop/ops/erfc/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/erfc.h" + +#ifdef ENABLE_CPU_API +#include "cpu/erfc_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/erfc_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/erfc_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/erfc_moore.h" +#endif + +__C __export infiniStatus_t infiniopCreateErfcDescriptor( + infiniopHandle_t handle, + infiniopErfcDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::erfc::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C __export infiniStatus_t infiniopGetErfcWorkspaceSize(infiniopErfcDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C __export infiniStatus_t infiniopErfc( + infiniopErfcDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C __export infiniStatus_t +infiniopDestroyErfcDescriptor(infiniopErfcDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/erfinv/cpu/erfinv_cpu.cc b/src/infiniop/ops/erfinv/cpu/erfinv_cpu.cc new file mode 100644 index 000000000..16c5c8cba --- /dev/null +++ b/src/infiniop/ops/erfinv/cpu/erfinv_cpu.cc @@ -0,0 +1,52 @@ +#include "erfinv_cpu.h" + +namespace op::erfinv::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erfinv::cpu diff --git a/src/infiniop/ops/erfinv/cpu/erfinv_cpu.h b/src/infiniop/ops/erfinv/cpu/erfinv_cpu.h new file mode 100644 index 000000000..8cc218a1d --- /dev/null +++ b/src/infiniop/ops/erfinv/cpu/erfinv_cpu.h @@ -0,0 +1,46 @@ +#ifndef __ERFINV_CPU_H__ +#define __ERFINV_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include +#include + +ELEMENTWISE_DESCRIPTOR(erfinv, cpu) + +namespace op::erfinv::cpu { + +// Inverse error function implementation using Newton's method +template +T erfinv_impl(T x) { + // Domain: x in (-1, 1) + if (x == 1.0) return std::numeric_limits::infinity(); + if (x == -1.0) return -std::numeric_limits::infinity(); + if (x > 1.0 || x < -1.0) return std::numeric_limits::quiet_NaN(); + if (x == 0.0) return 0.0; + + // Use Newton's method to solve erf(y) = x + T y = x; // Initial guess + const int max_iter = 10; + const T tol = static_cast(1e-10); + + for (int i = 0; i < max_iter; ++i) { + T erf_y = std::erf(y); + T derf_dy = 2.0 / std::sqrt(3.14159265358979323846) * std::exp(-y * y); + T error = erf_y - x; + if (std::abs(error) < tol) break; + y = y - error / derf_dy; + } + return y; +} + +typedef struct ErfinvOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + return erfinv_impl(x); + } +} ErfinvOp; +} // namespace op::erfinv::cpu + +#endif // __ERFINV_CPU_H__ diff --git a/src/infiniop/ops/erfinv/cuda/kernel.cuh b/src/infiniop/ops/erfinv/cuda/kernel.cuh new file mode 100644 index 000000000..4f4660e80 --- /dev/null +++ b/src/infiniop/ops/erfinv/cuda/kernel.cuh @@ -0,0 +1,113 @@ +#pragma once +#include +#include +#include +#include +#include +#include + +namespace op::cuda { + +// Inverse error function. +// +// We use a Winitzki-style approximation for an initial guess, then refine with +// a few Newton iterations. Starting with y=x converges poorly for x close to 1, +// which appears frequently in test inputs (torch.rand in [0,1)). +__device__ __forceinline__ float erfinv_impl(float x) { + if (x == 1.0f) return CUDART_INF_F; + if (x == -1.0f) return -CUDART_INF_F; + if (x > 1.0f || x < -1.0f) return CUDART_NAN_F; + if (x == 0.0f) return 0.0f; + + // Winitzki approximation (a = 0.147) for initial guess. + // See: https://arxiv.org/abs/math/0306301 (and common implementations). + const float a = 0.147f; + const float ln = log1pf(-x * x); // ln(1 - x^2) <= 0 + const float t = 2.0f / (CUDART_PI_F * a) + ln * 0.5f; + float inside = t * t - ln / a; + inside = inside > 0.0f ? inside : 0.0f; + float y0 = copysignf(sqrtf(sqrtf(inside) - t), x); + + // Fast path: a few Newton steps in float. + // This is sufficient for most x and much faster than always refining in double. + float y = y0; + const float sqrt_pi_f = 1.7724538509055159f; // sqrt(pi) +#pragma unroll + for (int i = 0; i < 4; ++i) { + const float erf_y = erff(y); + const float derf_dy = 2.0f / sqrt_pi_f * expf(-y * y); + y = y - (erf_y - x) / derf_dy; + } + + // Hybrid slow path: only for values extremely close to ±1 where float erf + // quantization can cause Newton iterations to stagnate, leading to noticeable + // absolute error in y (even if erff(y) == x in float). + // + // The threshold is chosen so the slow path is taken very rarely for typical + // random inputs, minimizing warp divergence and preserving performance. + const float ax = fabsf(x); + if (1.0f - ax < 1e-4f) { + const double xd = static_cast(x); + double yd = static_cast(y); + const double sqrt_pi = 1.7724538509055159; // sqrt(pi) +#pragma unroll + for (int i = 0; i < 4; ++i) { + const double erf_y = erf(yd); + const double derf_dy = 2.0 / sqrt_pi * exp(-yd * yd); + yd = yd - (erf_y - xd) / derf_dy; + } + y = static_cast(yd); + } + + return y; +} + +struct ErfinvOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(T x) const { + if constexpr (std::is_same_v) { + return erfinv_impl(x); + } else if constexpr (std::is_same_v) { + // For double, use similar approach + if (x == 1.0) return CUDART_INF; + if (x == -1.0) return -CUDART_INF; + if (x > 1.0 || x < -1.0) return CUDART_NAN; + if (x == 0.0) return 0.0; + const double a = 0.147; + const double ln = log1p(-x * x); + const double t = 2.0 / (CUDART_PI * a) + ln * 0.5; + double inside = t * t - ln / a; + inside = inside > 0.0 ? inside : 0.0; + double y = copysign(sqrt(sqrt(inside) - t), x); + + const int max_iter = 30; + const double tol = 1e-14; + const double sqrt_pi = 1.7724538509055159; + for (int i = 0; i < max_iter; ++i) { + const double erf_y = erf(y); + const double error = erf_y - x; + if (fabs(error) < tol) break; + const double derf_dy = 2.0 / sqrt_pi * exp(-y * y); + y = y - error / derf_dy; + } + return y; + } else { + // For F16/BF16: promote to float, compute, then cast back + float xf; + if constexpr (std::is_same_v) { + xf = __half2float(x); + return __float2half_rn(erfinv_impl(xf)); + } else if constexpr (std::is_same_v) { + xf = __bfloat162float(x); + return __float2bfloat16_rn(erfinv_impl(xf)); + } else { + xf = static_cast(x); + return static_cast(erfinv_impl(xf)); + } + } + } +}; + +} // namespace op::cuda diff --git a/src/infiniop/ops/erfinv/erfinv.h b/src/infiniop/ops/erfinv/erfinv.h new file mode 100644 index 000000000..f3ed9350f --- /dev/null +++ b/src/infiniop/ops/erfinv/erfinv.h @@ -0,0 +1,8 @@ +#ifndef __ERFINV_H__ +#define __ERFINV_H__ + +#include "../../elementwise/elementwise.h" + +#define DESCRIPTOR(NAMESPACE) ELEMENTWISE_DESCRIPTOR(erfinv, NAMESPACE) + +#endif // __ERFINV_H__ diff --git a/src/infiniop/ops/erfinv/metax/erfinv_metax.h b/src/infiniop/ops/erfinv/metax/erfinv_metax.h new file mode 100644 index 000000000..05058bfc6 --- /dev/null +++ b/src/infiniop/ops/erfinv/metax/erfinv_metax.h @@ -0,0 +1,8 @@ +#ifndef __ERFINV_METAX_API_H__ +#define __ERFINV_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(erfinv, metax) + +#endif // __ERFINV_METAX_API_H__ diff --git a/src/infiniop/ops/erfinv/metax/erfinv_metax.maca b/src/infiniop/ops/erfinv/metax/erfinv_metax.maca new file mode 100644 index 000000000..1e9144074 --- /dev/null +++ b/src/infiniop/ops/erfinv/metax/erfinv_metax.maca @@ -0,0 +1,60 @@ +#include "erfinv_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::erfinv::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfinvOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfinvOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ErfinvOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfinvOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erfinv::metax diff --git a/src/infiniop/ops/erfinv/moore/erfinv_moore.h b/src/infiniop/ops/erfinv/moore/erfinv_moore.h new file mode 100644 index 000000000..9eed18024 --- /dev/null +++ b/src/infiniop/ops/erfinv/moore/erfinv_moore.h @@ -0,0 +1,8 @@ +#ifndef __ERFINV_MOORE_API_H__ +#define __ERFINV_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(erfinv, moore) + +#endif // __ERFINV_MOORE_API_H__ diff --git a/src/infiniop/ops/erfinv/moore/erfinv_moore.mu b/src/infiniop/ops/erfinv/moore/erfinv_moore.mu new file mode 100644 index 000000000..54b5830ea --- /dev/null +++ b/src/infiniop/ops/erfinv/moore/erfinv_moore.mu @@ -0,0 +1,60 @@ +#include "erfinv_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "erfinv_moore_kernel.h" + +namespace op::erfinv::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, moore::ErfinvOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::ErfinvOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::ErfinvOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, moore::ErfinvOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erfinv::moore diff --git a/src/infiniop/ops/erfinv/moore/erfinv_moore_kernel.h b/src/infiniop/ops/erfinv/moore/erfinv_moore_kernel.h new file mode 100644 index 000000000..e3f3bb5f2 --- /dev/null +++ b/src/infiniop/ops/erfinv/moore/erfinv_moore_kernel.h @@ -0,0 +1,72 @@ +#ifndef __ERFINV_MOORE_KERNEL_H__ +#define __ERFINV_MOORE_KERNEL_H__ + +#include +#include +#include +#include + +namespace op::erfinv::moore { + +// Inverse error function using Newton's method +template +__device__ __forceinline__ T erfinv_impl(T x) { + if (x >= 1.0f) return CUDART_INF_F; + if (x <= -1.0f) return -CUDART_INF_F; + if (x == 0.0f) return 0.0f; + + T y = x; + const int max_iter = 10; + const T tol = 1e-10f; + const T sqrt_pi = 1.7724538509055159f; + + for (int i = 0; i < max_iter; ++i) { + T erf_y = erff(y); + T derf_dy = 2.0f / sqrt_pi * expf(-y * y); + T error = erf_y - x; + if (fabsf(error) < tol) break; + y = y - error / derf_dy; + } + return y; +} + +typedef struct ErfinvOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + float x0 = __low2float(x); + float x1 = __high2float(x); + return __floats2half2_rn(erfinv_impl(x0), erfinv_impl(x1)); + } else if constexpr (std::is_same_v) { + float xf = __half2float(x); + return __float2half(erfinv_impl(xf)); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(x); + return __float2bfloat16_rn(erfinv_impl(xf)); + } else if constexpr (std::is_same_v) { + return erfinv_impl(x); + } else { // double + if (x >= 1.0) return CUDART_INF; + if (x <= -1.0) return -CUDART_INF; + if (x == 0.0) return 0.0; + double y = x; + const int max_iter = 10; + const double tol = 1e-10; + const double sqrt_pi = 1.7724538509055159; + for (int i = 0; i < max_iter; ++i) { + double erf_y = erf(y); + double derf_dy = 2.0 / sqrt_pi * exp(-y * y); + double error = erf_y - x; + if (fabs(error) < tol) break; + y = y - error / derf_dy; + } + return y; + } + } +} ErfinvOp; + +} // namespace op::erfinv::moore + +#endif // __ERFINV_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cu b/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cu new file mode 100644 index 000000000..35f5d3fe2 --- /dev/null +++ b/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cu @@ -0,0 +1,58 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "erfinv_nvidia.cuh" + +namespace op::erfinv::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfinvOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfinvOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ErfinvOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfinvOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::erfinv::nvidia diff --git a/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cuh b/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cuh new file mode 100644 index 000000000..af80be12f --- /dev/null +++ b/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ERFINV_NVIDIA_H__ +#define __ERFINV_NVIDIA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(erfinv, nvidia) + +#endif // __ERFINV_NVIDIA_H__ diff --git a/src/infiniop/ops/erfinv/operator.cc b/src/infiniop/ops/erfinv/operator.cc new file mode 100644 index 000000000..c7c360bec --- /dev/null +++ b/src/infiniop/ops/erfinv/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/erfinv.h" + +#ifdef ENABLE_CPU_API +#include "cpu/erfinv_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/erfinv_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/erfinv_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/erfinv_moore.h" +#endif + +__C __export infiniStatus_t infiniopCreateErfinvDescriptor( + infiniopHandle_t handle, + infiniopErfinvDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::erfinv::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C __export infiniStatus_t infiniopGetErfinvWorkspaceSize(infiniopErfinvDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C __export infiniStatus_t infiniopErfinv( + infiniopErfinvDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C __export infiniStatus_t +infiniopDestroyErfinvDescriptor(infiniopErfinvDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.cc b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.cc new file mode 100644 index 000000000..22a792012 --- /dev/null +++ b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.cc @@ -0,0 +1,201 @@ +#include "matrix_power_cpu.h" +#include "../../../utils.h" +#include +#include + +namespace op::matrix_power::cpu { + +utils::Result MatrixPowerInfo::create( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + int n) { + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 2 || x_shape[0] != x_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (y_shape != x_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (n < 0) { + return INFINI_STATUS_BAD_PARAM; + } + + MatrixPowerInfo info; + info.matrix_size = x_shape[0]; + info.n = static_cast(n); + info.input_size = x_desc->numel(); + info.output_size = y_desc->numel(); + + return utils::Result(std::move(info)); +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES); + CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + + auto info_result = MatrixPowerInfo::create(x_desc, y_desc, n); + CHECK_RESULT(info_result); + + *desc_ptr = new Descriptor(dtype, info_result.take(), handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +void matrix_power_impl( + const MatrixPowerInfo &info, + T *y, + const T *x, + void *workspace) { + + size_t n = info.matrix_size; + int power = static_cast(info.n); + + // Use workspace for temporary matrices + T *temp1 = reinterpret_cast(workspace); + T *temp2 = temp1 + n * n; + + // Initialize result as identity matrix + std::memset(y, 0, n * n * sizeof(T)); + for (size_t i = 0; i < n; ++i) { + y[i * n + i] = utils::cast(1.0); + } + + // Copy input to temp1 + std::memcpy(temp1, x, n * n * sizeof(T)); + + // Binary exponentiation + while (power > 0) { + if (power & 1) { + // Multiply result by temp1 + std::memset(temp2, 0, n * n * sizeof(T)); + for (size_t i = 0; i < n; ++i) { + for (size_t k = 0; k < n; ++k) { + T val = y[i * n + k]; + for (size_t j = 0; j < n; ++j) { + temp2[i * n + j] += val * temp1[k * n + j]; + } + } + } + std::memcpy(y, temp2, n * n * sizeof(T)); + } + // Square temp1 + std::memset(temp2, 0, n * n * sizeof(T)); + for (size_t i = 0; i < n; ++i) { + for (size_t k = 0; k < n; ++k) { + T val = temp1[i * n + k]; + for (size_t j = 0; j < n; ++j) { + temp2[i * n + j] += val * temp1[k * n + j]; + } + } + } + std::memcpy(temp1, temp2, n * n * sizeof(T)); + power >>= 1; + } +} + +template +void write_identity_impl(size_t n, T *y) { + std::fill(y, y + n * n, utils::cast(0.0)); + for (size_t i = 0; i < n; ++i) { + y[i * n + i] = utils::cast(1.0); + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + if (_info.matrix_size == 0) { + return INFINI_STATUS_SUCCESS; + } + if (_info.n == 0) { + const size_t n = _info.matrix_size; + switch (_dtype) { + case INFINI_DTYPE_F16: + write_identity_impl(n, reinterpret_cast(y)); + return INFINI_STATUS_SUCCESS; + case INFINI_DTYPE_BF16: + write_identity_impl(n, reinterpret_cast(y)); + return INFINI_STATUS_SUCCESS; + case INFINI_DTYPE_F32: + write_identity_impl(n, reinterpret_cast(y)); + return INFINI_STATUS_SUCCESS; + case INFINI_DTYPE_F64: + write_identity_impl(n, reinterpret_cast(y)); + return INFINI_STATUS_SUCCESS; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + } + + if (workspace_size < this->workspaceSize()) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + if (this->workspaceSize() > 0 && workspace == nullptr) { + return INFINI_STATUS_BAD_PARAM; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: { + // Convert to float for computation + std::vector x_f(_info.input_size); + std::vector y_f(_info.output_size); + float *workspace_f = reinterpret_cast(workspace); + for (size_t i = 0; i < _info.input_size; ++i) { + x_f[i] = utils::cast(reinterpret_cast(x)[i]); + } + MatrixPowerInfo info_f = _info; + matrix_power_impl(info_f, y_f.data(), x_f.data(), workspace_f); + for (size_t i = 0; i < _info.output_size; ++i) { + reinterpret_cast(y)[i] = utils::cast(y_f[i]); + } + break; + } + case INFINI_DTYPE_BF16: { + std::vector x_f(_info.input_size); + std::vector y_f(_info.output_size); + float *workspace_f = reinterpret_cast(workspace); + for (size_t i = 0; i < _info.input_size; ++i) { + x_f[i] = utils::cast(reinterpret_cast(x)[i]); + } + MatrixPowerInfo info_f = _info; + matrix_power_impl(info_f, y_f.data(), x_f.data(), workspace_f); + for (size_t i = 0; i < _info.output_size; ++i) { + reinterpret_cast(y)[i] = utils::cast(y_f[i]); + } + break; + } + case INFINI_DTYPE_F32: + matrix_power_impl(_info, reinterpret_cast(y), reinterpret_cast(x), workspace); + break; + case INFINI_DTYPE_F64: + matrix_power_impl(_info, reinterpret_cast(y), reinterpret_cast(x), workspace); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::matrix_power::cpu diff --git a/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.h b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.h new file mode 100644 index 000000000..9c6f2ebc1 --- /dev/null +++ b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.h @@ -0,0 +1,69 @@ +#ifndef __MATRIX_POWER_CPU_H__ +#define __MATRIX_POWER_CPU_H__ + +#include "../../../operator.h" +#include "../../../devices/cpu/common_cpu.h" +#include + +namespace op::matrix_power::cpu { + +struct MatrixPowerInfo { + size_t matrix_size; // N x N matrix + size_t n; // Power + size_t input_size; + size_t output_size; + + static utils::Result create( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + int n); +}; + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + MatrixPowerInfo _info; + + Descriptor(infiniDtype_t dtype, MatrixPowerInfo info, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n); + + size_t workspaceSize() const { + if (_info.n == 0 || _info.matrix_size == 0) { + return 0; + } + const size_t elems = 2 * _info.matrix_size * _info.matrix_size; + switch (_dtype) { + case INFINI_DTYPE_F16: + case INFINI_DTYPE_BF16: + case INFINI_DTYPE_F32: + return elems * sizeof(float); + case INFINI_DTYPE_F64: + return elems * sizeof(double); + default: + return 0; + } + } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::matrix_power::cpu + +#endif // __MATRIX_POWER_CPU_H__ diff --git a/src/infiniop/ops/matrix_power/metax/matrix_power_metax.h b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.h new file mode 100644 index 000000000..02ff4b14d --- /dev/null +++ b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.h @@ -0,0 +1,48 @@ +#ifndef __MATRIX_POWER_METAX_H__ +#define __MATRIX_POWER_METAX_H__ + +#include "../../../operator.h" +#include "../../../devices/metax/metax_common.h" + +namespace op::matrix_power::metax { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t matrix_size; + size_t n; + size_t input_size; + size_t output_size; + + Descriptor(infiniDtype_t dtype, size_t matrix_size, size_t n, + size_t input_size, size_t output_size, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + matrix_size(matrix_size), + n(n), + input_size(input_size), + output_size(output_size) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::matrix_power::metax + +#endif // __MATRIX_POWER_METAX_H__ diff --git a/src/infiniop/ops/matrix_power/metax/matrix_power_metax.maca b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.maca new file mode 100644 index 000000000..0e7f58f19 --- /dev/null +++ b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.maca @@ -0,0 +1,107 @@ +#include "matrix_power_metax.h" +#include "../../../utils.h" +#include "../../../devices/metax/metax_kernel_common.h" +#include +#include +#include + +namespace op::matrix_power::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F32); + CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 2 || x_shape[0] != x_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (y_shape != x_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + CHECK_OR_RETURN(n >= 0, INFINI_STATUS_BAD_PARAM); + + CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES); + CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + + *desc_ptr = new Descriptor(dtype, x_shape[0], static_cast(n), + x_desc->numel(), y_desc->numel(), + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + if (workspace_size < this->workspaceSize()) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + auto hc_stream = reinterpret_cast(stream); + CHECK_OR_RETURN(_dtype == INFINI_DTYPE_F32, INFINI_STATUS_BAD_TENSOR_DTYPE); + size_t input_bytes = input_size * sizeof(float); + + // Use CPU fallback for now + std::vector h_matrix(input_size); + CHECK_METAX(hcMemcpyAsync(h_matrix.data(), x, input_bytes, hcMemcpyDeviceToHost, hc_stream)); + CHECK_METAX(hcStreamSynchronize(hc_stream)); + + std::vector result(output_size, 0.0f); + std::vector temp1(input_size); + std::vector temp2(input_size); + std::memcpy(temp1.data(), h_matrix.data(), input_bytes); + + for (size_t i = 0; i < matrix_size; ++i) { + result[i * matrix_size + i] = 1.0f; + } + + int power = static_cast(n); + while (power > 0) { + if (power & 1) { + std::fill(temp2.begin(), temp2.end(), 0.0f); + for (size_t i = 0; i < matrix_size; ++i) { + for (size_t k = 0; k < matrix_size; ++k) { + float val = result[i * matrix_size + k]; + for (size_t j = 0; j < matrix_size; ++j) { + temp2[i * matrix_size + j] += val * temp1[k * matrix_size + j]; + } + } + } + std::memcpy(result.data(), temp2.data(), output_size * sizeof(float)); + } + std::fill(temp2.begin(), temp2.end(), 0.0f); + for (size_t i = 0; i < matrix_size; ++i) { + for (size_t k = 0; k < matrix_size; ++k) { + float val = temp1[i * matrix_size + k]; + for (size_t j = 0; j < matrix_size; ++j) { + temp2[i * matrix_size + j] += val * temp1[k * matrix_size + j]; + } + } + } + std::memcpy(temp1.data(), temp2.data(), input_bytes); + power >>= 1; + } + + CHECK_METAX(hcMemcpyAsync(y, result.data(), input_bytes, hcMemcpyHostToDevice, hc_stream)); + CHECK_METAX(hcStreamSynchronize(hc_stream)); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::matrix_power::metax diff --git a/src/infiniop/ops/matrix_power/moore/matrix_power_moore.h b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.h new file mode 100644 index 000000000..a58428a6e --- /dev/null +++ b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.h @@ -0,0 +1,48 @@ +#ifndef __MATRIX_POWER_MOORE_H__ +#define __MATRIX_POWER_MOORE_H__ + +#include "../../../operator.h" +#include "../../../devices/moore/moore_common.h" + +namespace op::matrix_power::moore { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t matrix_size; + size_t n; + size_t input_size; + size_t output_size; + + Descriptor(infiniDtype_t dtype, size_t matrix_size, size_t n, + size_t input_size, size_t output_size, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + matrix_size(matrix_size), + n(n), + input_size(input_size), + output_size(output_size) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::matrix_power::moore + +#endif // __MATRIX_POWER_MOORE_H__ diff --git a/src/infiniop/ops/matrix_power/moore/matrix_power_moore.mu b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.mu new file mode 100644 index 000000000..532480955 --- /dev/null +++ b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.mu @@ -0,0 +1,106 @@ +#include "matrix_power_moore.h" +#include "../../../utils.h" +#include "../../../devices/moore/moore_kernel_common.h" +#include +#include +#include + +namespace op::matrix_power::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F32); + CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 2 || x_shape[0] != x_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (y_shape != x_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + CHECK_OR_RETURN(n >= 0, INFINI_STATUS_BAD_PARAM); + + CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES); + CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + + *desc_ptr = new Descriptor(dtype, x_shape[0], static_cast(n), + x_desc->numel(), y_desc->numel(), + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + if (workspace_size < this->workspaceSize()) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + auto musa_stream = reinterpret_cast(stream); + CHECK_OR_RETURN(_dtype == INFINI_DTYPE_F32, INFINI_STATUS_BAD_TENSOR_DTYPE); + size_t input_bytes = input_size * sizeof(float); + + std::vector h_matrix(input_size); + CHECK_MOORE(musaMemcpyAsync(h_matrix.data(), x, input_bytes, musaMemcpyDeviceToHost, musa_stream)); + CHECK_MOORE(musaStreamSynchronize(musa_stream)); + + std::vector result(output_size, 0.0f); + std::vector temp1(input_size); + std::vector temp2(input_size); + std::memcpy(temp1.data(), h_matrix.data(), input_bytes); + + for (size_t i = 0; i < matrix_size; ++i) { + result[i * matrix_size + i] = 1.0f; + } + + int power = static_cast(n); + while (power > 0) { + if (power & 1) { + std::fill(temp2.begin(), temp2.end(), 0.0f); + for (size_t i = 0; i < matrix_size; ++i) { + for (size_t k = 0; k < matrix_size; ++k) { + float val = result[i * matrix_size + k]; + for (size_t j = 0; j < matrix_size; ++j) { + temp2[i * matrix_size + j] += val * temp1[k * matrix_size + j]; + } + } + } + std::memcpy(result.data(), temp2.data(), output_size * sizeof(float)); + } + std::fill(temp2.begin(), temp2.end(), 0.0f); + for (size_t i = 0; i < matrix_size; ++i) { + for (size_t k = 0; k < matrix_size; ++k) { + float val = temp1[i * matrix_size + k]; + for (size_t j = 0; j < matrix_size; ++j) { + temp2[i * matrix_size + j] += val * temp1[k * matrix_size + j]; + } + } + } + std::memcpy(temp1.data(), temp2.data(), input_bytes); + power >>= 1; + } + + CHECK_MOORE(musaMemcpyAsync(y, result.data(), input_bytes, musaMemcpyHostToDevice, musa_stream)); + CHECK_MOORE(musaStreamSynchronize(musa_stream)); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::matrix_power::moore diff --git a/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cu b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cu new file mode 100644 index 000000000..e6164c8ca --- /dev/null +++ b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cu @@ -0,0 +1,548 @@ +#include "matrix_power_nvidia.cuh" +#include "../../../utils.h" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include +#include +#include +#include + +namespace op::matrix_power::nvidia { + +namespace { + +template +__forceinline__ __device__ T identityZero(); + +template +__forceinline__ __device__ T identityOne(); + +template <> +__forceinline__ __device__ __half identityZero<__half>() { + return __float2half(0.0f); +} + +template <> +__forceinline__ __device__ __half identityOne<__half>() { + return __float2half(1.0f); +} + +template <> +__forceinline__ __device__ cuda_bfloat16 identityZero() { + return __float2bfloat16(0.0f); +} + +template <> +__forceinline__ __device__ cuda_bfloat16 identityOne() { + return __float2bfloat16(1.0f); +} + +template <> +__forceinline__ __device__ float identityZero() { + return 0.0f; +} + +template <> +__forceinline__ __device__ float identityOne() { + return 1.0f; +} + +template <> +__forceinline__ __device__ double identityZero() { + return 0.0; +} + +template <> +__forceinline__ __device__ double identityOne() { + return 1.0; +} + +template +INFINIOP_CUDA_KERNEL packMatrix2dStridedToContiguous( + const T *src, + T *dst, + size_t matrix_size, + ptrdiff_t src_stride_0, + ptrdiff_t src_stride_1) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t matrix_numel = matrix_size * matrix_size; + if (idx < matrix_numel) { + size_t row = idx / matrix_size; + size_t col = idx - row * matrix_size; + dst[idx] = src[row * src_stride_0 + col * src_stride_1]; + } +} + +template +INFINIOP_CUDA_KERNEL scatterMatrix2dContiguousToStrided( + const T *src, + T *dst, + size_t matrix_size, + ptrdiff_t dst_stride_0, + ptrdiff_t dst_stride_1) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t matrix_numel = matrix_size * matrix_size; + if (idx < matrix_numel) { + size_t row = idx / matrix_size; + size_t col = idx - row * matrix_size; + dst[row * dst_stride_0 + col * dst_stride_1] = src[idx]; + } +} + +template +INFINIOP_CUDA_KERNEL setIdentity2dStrided( + T *out, + size_t matrix_size, + ptrdiff_t out_stride_0, + ptrdiff_t out_stride_1) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t matrix_numel = matrix_size * matrix_size; + if (idx < matrix_numel) { + size_t row = idx / matrix_size; + size_t col = idx - row * matrix_size; + out[row * out_stride_0 + col * out_stride_1] = (row == col) ? identityOne() : identityZero(); + } +} + +INFINIOP_CUDA_KERNEL setDiagonalFp16(__half *out, size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + out[idx * n + idx] = __float2half(1.0f); + } +} + +INFINIOP_CUDA_KERNEL setDiagonalBf16(cuda_bfloat16 *out, size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + out[idx * n + idx] = __float2bfloat16(1.0f); + } +} + +INFINIOP_CUDA_KERNEL setDiagonalFp32(float *out, size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + out[idx * n + idx] = 1.0f; + } +} + +INFINIOP_CUDA_KERNEL setDiagonalFp64(double *out, size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + out[idx * n + idx] = 1.0; + } +} + +infiniStatus_t initializeIdentity( + void *y, + infiniDtype_t dtype, + size_t matrix_size, + bool y_contiguous, + ptrdiff_t y_stride_0, + ptrdiff_t y_stride_1, + cudaStream_t stream) { + + if (matrix_size == 0) { + return INFINI_STATUS_SUCCESS; + } + + constexpr int threads = 256; + size_t diag_blocks = CEIL_DIV(matrix_size, static_cast(threads)); + size_t matrix_numel = matrix_size * matrix_size; + size_t matrix_blocks = CEIL_DIV(matrix_numel, static_cast(threads)); + + if (y_contiguous) { + CHECK_CUDA(cudaMemsetAsync(y, 0, matrix_numel * infiniSizeOf(dtype), stream)); + } + + switch (dtype) { + case INFINI_DTYPE_F16: + if (y_contiguous) { + setDiagonalFp16<<(diag_blocks), threads, 0, stream>>>( + reinterpret_cast<__half *>(y), matrix_size); + } else { + setIdentity2dStrided<<(matrix_blocks), threads, 0, stream>>>( + reinterpret_cast<__half *>(y), matrix_size, y_stride_0, y_stride_1); + } + break; + case INFINI_DTYPE_BF16: + if (y_contiguous) { + setDiagonalBf16<<(diag_blocks), threads, 0, stream>>>( + reinterpret_cast(y), matrix_size); + } else { + setIdentity2dStrided<<(matrix_blocks), threads, 0, stream>>>( + reinterpret_cast(y), matrix_size, y_stride_0, y_stride_1); + } + break; + case INFINI_DTYPE_F32: + if (y_contiguous) { + setDiagonalFp32<<(diag_blocks), threads, 0, stream>>>( + reinterpret_cast(y), matrix_size); + } else { + setIdentity2dStrided<<(matrix_blocks), threads, 0, stream>>>( + reinterpret_cast(y), matrix_size, y_stride_0, y_stride_1); + } + break; + case INFINI_DTYPE_F64: + if (y_contiguous) { + setDiagonalFp64<<(diag_blocks), threads, 0, stream>>>( + reinterpret_cast(y), matrix_size); + } else { + setIdentity2dStrided<<(matrix_blocks), threads, 0, stream>>>( + reinterpret_cast(y), matrix_size, y_stride_0, y_stride_1); + } + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + CHECK_CUDA(cudaGetLastError()); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t packInputToContiguous( + void *dst, + const void *src, + infiniDtype_t dtype, + size_t matrix_size, + ptrdiff_t src_stride_0, + ptrdiff_t src_stride_1, + cudaStream_t stream) { + constexpr int threads = 256; + size_t matrix_numel = matrix_size * matrix_size; + size_t blocks = CEIL_DIV(matrix_numel, static_cast(threads)); + switch (dtype) { + case INFINI_DTYPE_F16: + packMatrix2dStridedToContiguous<<(blocks), threads, 0, stream>>>( + reinterpret_cast(src), reinterpret_cast<__half *>(dst), + matrix_size, src_stride_0, src_stride_1); + break; + case INFINI_DTYPE_BF16: + packMatrix2dStridedToContiguous<<(blocks), threads, 0, stream>>>( + reinterpret_cast(src), reinterpret_cast(dst), + matrix_size, src_stride_0, src_stride_1); + break; + case INFINI_DTYPE_F32: + packMatrix2dStridedToContiguous<<(blocks), threads, 0, stream>>>( + reinterpret_cast(src), reinterpret_cast(dst), + matrix_size, src_stride_0, src_stride_1); + break; + case INFINI_DTYPE_F64: + packMatrix2dStridedToContiguous<<(blocks), threads, 0, stream>>>( + reinterpret_cast(src), reinterpret_cast(dst), + matrix_size, src_stride_0, src_stride_1); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + CHECK_CUDA(cudaGetLastError()); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t scatterContiguousToOutput( + void *dst, + const void *src, + infiniDtype_t dtype, + size_t matrix_size, + ptrdiff_t dst_stride_0, + ptrdiff_t dst_stride_1, + cudaStream_t stream) { + constexpr int threads = 256; + size_t matrix_numel = matrix_size * matrix_size; + size_t blocks = CEIL_DIV(matrix_numel, static_cast(threads)); + switch (dtype) { + case INFINI_DTYPE_F16: + scatterMatrix2dContiguousToStrided<<(blocks), threads, 0, stream>>>( + reinterpret_cast(src), reinterpret_cast<__half *>(dst), + matrix_size, dst_stride_0, dst_stride_1); + break; + case INFINI_DTYPE_BF16: + scatterMatrix2dContiguousToStrided<<(blocks), threads, 0, stream>>>( + reinterpret_cast(src), reinterpret_cast(dst), + matrix_size, dst_stride_0, dst_stride_1); + break; + case INFINI_DTYPE_F32: + scatterMatrix2dContiguousToStrided<<(blocks), threads, 0, stream>>>( + reinterpret_cast(src), reinterpret_cast(dst), + matrix_size, dst_stride_0, dst_stride_1); + break; + case INFINI_DTYPE_F64: + scatterMatrix2dContiguousToStrided<<(blocks), threads, 0, stream>>>( + reinterpret_cast(src), reinterpret_cast(dst), + matrix_size, dst_stride_0, dst_stride_1); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + CHECK_CUDA(cudaGetLastError()); + return INFINI_STATUS_SUCCESS; +} + +#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) +using GemmComputeType = cudaDataType; +#else +using GemmComputeType = cublasComputeType_t; +#endif + +struct GemmTypeConfig { + cudaDataType io_type; + GemmComputeType compute_type; +}; + +infiniStatus_t getGemmTypeConfig(infiniDtype_t dtype, GemmTypeConfig &cfg) { + switch (dtype) { + case INFINI_DTYPE_F16: + cfg.io_type = CUDA_R_16F; +#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) + cfg.compute_type = CUDA_R_32F; +#else + cfg.compute_type = CUBLAS_COMPUTE_32F; +#endif + return INFINI_STATUS_SUCCESS; + case INFINI_DTYPE_BF16: + cfg.io_type = CUDA_R_16BF; +#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) + cfg.compute_type = CUDA_R_32F; +#else + cfg.compute_type = CUBLAS_COMPUTE_32F; +#endif + return INFINI_STATUS_SUCCESS; + case INFINI_DTYPE_F32: + cfg.io_type = CUDA_R_32F; +#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) + cfg.compute_type = CUDA_R_32F; +#else + cfg.compute_type = CUBLAS_COMPUTE_32F; +#endif + return INFINI_STATUS_SUCCESS; + case INFINI_DTYPE_F64: + cfg.io_type = CUDA_R_64F; +#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) + cfg.compute_type = CUDA_R_64F; +#else + cfg.compute_type = CUBLAS_COMPUTE_64F; +#endif + return INFINI_STATUS_SUCCESS; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +// Compute row-major C = A * B using cuBLAS column-major GEMM: +// C_col = B_col * A_col, where *_col views the same memory as column-major. +infiniStatus_t gemmRowMajorSquare( + cublasHandle_t handle, + const GemmTypeConfig &cfg, + infiniDtype_t dtype, + int n, + const void *a, + const void *b, + void *c) { + + if (dtype == INFINI_DTYPE_F64) { + const double alpha = 1.0; + const double beta = 0.0; + CHECK_CUBLAS(cublasGemmEx( + handle, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + n, + n, + &alpha, + b, + cfg.io_type, + n, + a, + cfg.io_type, + n, + &beta, + c, + cfg.io_type, + n, + cfg.compute_type, + CUBLAS_GEMM_DEFAULT)); + return INFINI_STATUS_SUCCESS; + } + + const float alpha = 1.0f; + const float beta = 0.0f; + CHECK_CUBLAS(cublasGemmEx( + handle, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + n, + n, + &alpha, + b, + cfg.io_type, + n, + a, + cfg.io_type, + n, + &beta, + c, + cfg.io_type, + n, + cfg.compute_type, + CUBLAS_GEMM_DEFAULT)); + return INFINI_STATUS_SUCCESS; +} + +} // namespace + +struct Descriptor::Opaque { + std::shared_ptr internal; + + Opaque(std::shared_ptr internal_) + : internal(internal_) {} +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n) { + + if (handle == nullptr || desc_ptr == nullptr || y_desc == nullptr || x_desc == nullptr) { + return INFINI_STATUS_BAD_PARAM; + } + if (n < 0) { + return INFINI_STATUS_BAD_PARAM; + } + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + if (y_desc->dtype() != dtype) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 2 || x_shape[0] != x_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (y_shape != x_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + if (x_desc->hasBroadcastDim() || y_desc->hasBroadcastDim()) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + if (x_shape[0] > static_cast(std::numeric_limits::max())) { + return INFINI_STATUS_BAD_PARAM; + } + + auto x_strides = x_desc->strides(); + auto y_strides = y_desc->strides(); + bool x_contiguous = x_desc->isContiguous(); + bool y_contiguous = y_desc->isContiguous(); + + size_t matrix_numel = x_desc->numel(); + size_t matrix_bytes = matrix_numel * infiniSizeOf(dtype); + size_t workspace_size = 0; + if (n != 0) { + workspace_size = matrix_bytes * (y_contiguous ? 2 : 3); + } + + auto handle_nvidia = reinterpret_cast(handle); + Descriptor *desc = new Descriptor(dtype, x_shape[0], static_cast(n), + matrix_numel, y_desc->numel(), workspace_size, + x_strides[0], x_strides[1], y_strides[0], y_strides[1], + x_contiguous, y_contiguous, + handle->device, handle->device_id); + desc->_opaque = new Opaque(handle_nvidia->internal()); + *desc_ptr = desc; + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + if (x == nullptr || y == nullptr) { + return INFINI_STATUS_BAD_PARAM; + } + if (workspace_size < this->workspaceSize()) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + if (this->workspaceSize() != 0 && workspace == nullptr) { + return INFINI_STATUS_BAD_PARAM; + } + + auto cuda_stream = reinterpret_cast(stream); + if (matrix_size == 0) { + return INFINI_STATUS_SUCCESS; + } + if (n == 0) { + CHECK_STATUS(initializeIdentity( + y, _dtype, matrix_size, y_contiguous, y_stride_0, y_stride_1, cuda_stream)); + return INFINI_STATUS_SUCCESS; + } + + size_t matrix_bytes = input_size * infiniSizeOf(_dtype); + char *workspace_ptr = reinterpret_cast(workspace); + void *base = workspace_ptr; + void *temp = workspace_ptr + matrix_bytes; + void *contiguous_output = y_contiguous ? y : (workspace_ptr + matrix_bytes * 2); + + CHECK_STATUS(initializeIdentity( + contiguous_output, _dtype, matrix_size, true, 0, 0, cuda_stream)); + + if (x_contiguous) { + CHECK_CUDA(cudaMemcpyAsync(base, x, matrix_bytes, cudaMemcpyDeviceToDevice, cuda_stream)); + } else { + CHECK_STATUS(packInputToContiguous( + base, x, _dtype, matrix_size, x_stride_0, x_stride_1, cuda_stream)); + } + + GemmTypeConfig cfg; + CHECK_STATUS(getGemmTypeConfig(_dtype, cfg)); + + void *result = contiguous_output; + void *scratch = temp; + void *base_matrix = base; + size_t power = n; + int matrix_dim = static_cast(matrix_size); + + CHECK_STATUS(_opaque->internal->useCublas( + cuda_stream, + [&](cublasHandle_t handle) { + while (power > 0) { + if (power & 1) { + CHECK_STATUS(gemmRowMajorSquare(handle, cfg, _dtype, matrix_dim, result, base_matrix, scratch)); + std::swap(result, scratch); + } + power >>= 1; + if (power == 0) { + break; + } + CHECK_STATUS(gemmRowMajorSquare(handle, cfg, _dtype, matrix_dim, base_matrix, base_matrix, scratch)); + std::swap(base_matrix, scratch); + } + return INFINI_STATUS_SUCCESS; + })); + + if (y_contiguous) { + if (result != y) { + CHECK_CUDA(cudaMemcpyAsync(y, result, matrix_bytes, cudaMemcpyDeviceToDevice, cuda_stream)); + } + } else { + CHECK_STATUS(scatterContiguousToOutput( + y, result, _dtype, matrix_size, y_stride_0, y_stride_1, cuda_stream)); + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::matrix_power::nvidia diff --git a/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cuh b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cuh new file mode 100644 index 000000000..ebe0ecbdf --- /dev/null +++ b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cuh @@ -0,0 +1,72 @@ +#ifndef __MATRIX_POWER_NVIDIA_H__ +#define __MATRIX_POWER_NVIDIA_H__ + +#include "../../../operator.h" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include + +namespace op::matrix_power::nvidia { + +class Descriptor final : public InfiniopDescriptor { + struct Opaque; + Opaque *_opaque; + infiniDtype_t _dtype; + size_t matrix_size; + size_t n; + size_t input_size; + size_t output_size; + size_t workspace_size; + ptrdiff_t x_stride_0; + ptrdiff_t x_stride_1; + ptrdiff_t y_stride_0; + ptrdiff_t y_stride_1; + bool x_contiguous; + bool y_contiguous; + + Descriptor(infiniDtype_t dtype, size_t matrix_size, size_t n, + size_t input_size, size_t output_size, + size_t workspace_size, + ptrdiff_t x_stride_0, ptrdiff_t x_stride_1, + ptrdiff_t y_stride_0, ptrdiff_t y_stride_1, + bool x_contiguous, bool y_contiguous, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _opaque(nullptr), + _dtype(dtype), + matrix_size(matrix_size), + n(n), + input_size(input_size), + output_size(output_size), + workspace_size(workspace_size), + x_stride_0(x_stride_0), + x_stride_1(x_stride_1), + y_stride_0(y_stride_0), + y_stride_1(y_stride_1), + x_contiguous(x_contiguous), + y_contiguous(y_contiguous) {} + +public: + ~Descriptor(); + + friend struct Opaque; + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n); + + size_t workspaceSize() const { return workspace_size; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::matrix_power::nvidia + +#endif // __MATRIX_POWER_NVIDIA_H__ diff --git a/src/infiniop/ops/matrix_power/operator.cc b/src/infiniop/ops/matrix_power/operator.cc new file mode 100644 index 000000000..d26e26fd1 --- /dev/null +++ b/src/infiniop/ops/matrix_power/operator.cc @@ -0,0 +1,159 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/matrix_power.h" + +#ifdef ENABLE_CPU_API +#include "cpu/matrix_power_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/matrix_power_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/matrix_power_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/matrix_power_moore.h" +#endif + +__C __export infiniStatus_t infiniopCreateMatrixPowerDescriptor( + infiniopHandle_t handle, + infiniopMatrixPowerDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::matrix_power::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ + n) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C __export infiniStatus_t infiniopGetMatrixPowerWorkspaceSize(infiniopMatrixPowerDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C __export infiniStatus_t infiniopMatrixPower( + infiniopMatrixPowerDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, x, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C __export infiniStatus_t +infiniopDestroyMatrixPowerDescriptor(infiniopMatrixPowerDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.cc b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.cc new file mode 100644 index 000000000..19c1de74b --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.cc @@ -0,0 +1,137 @@ +#include "pixel_shuffle_cpu.h" +#include "../../../utils.h" + +namespace op::pixel_shuffle::cpu { + +utils::Result PixelShuffleInfo::create( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + int upscale_factor) { + + if (upscale_factor <= 0) { + return INFINI_STATUS_BAD_PARAM; + } + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 4) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t batch = x_shape[0]; + size_t in_channels = x_shape[1]; + size_t height = x_shape[2]; + size_t width = x_shape[3]; + + // Input: (N, C*r^2, H, W) -> Output: (N, C, H*r, W*r) + if (in_channels % (upscale_factor * upscale_factor) != 0) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t out_channels = in_channels / (upscale_factor * upscale_factor); + size_t out_height = height * upscale_factor; + size_t out_width = width * upscale_factor; + + std::vector expected_y_shape = {batch, out_channels, out_height, out_width}; + if (y_shape != expected_y_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + PixelShuffleInfo info; + info.batch = batch; + info.in_channels = in_channels; + info.out_channels = out_channels; + info.height = height; + info.width = width; + info.upscale_factor = upscale_factor; + info.input_size = x_desc->numel(); + info.output_size = y_desc->numel(); + + return utils::Result(std::move(info)); +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES); + CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + + auto info_result = PixelShuffleInfo::create(x_desc, y_desc, upscale_factor); + CHECK_RESULT(info_result); + + *desc_ptr = new Descriptor(dtype, info_result.take(), handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +void pixel_shuffle_impl( + const PixelShuffleInfo &info, + T *y, + const T *x) { + + int r = info.upscale_factor; + + // Input: (N, C*r^2, H, W) + // Output: (N, C, H*r, W*r) + for (size_t n = 0; n < info.batch; ++n) { + for (size_t c = 0; c < info.out_channels; ++c) { + for (size_t h = 0; h < info.height; ++h) { + for (size_t w = 0; w < info.width; ++w) { + for (int i = 0; i < r; ++i) { + for (int j = 0; j < r; ++j) { + // Input channel index + size_t in_c = c * r * r + i * r + j; + // Input position + size_t in_idx = ((n * info.in_channels + in_c) * info.height + h) * info.width + w; + // Output position + size_t out_h = h * r + i; + size_t out_w = w * r + j; + size_t out_idx = ((n * info.out_channels + c) * (info.height * r) + out_h) * (info.width * r) + out_w; + y[out_idx] = x[in_idx]; + } + } + } + } + } + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + pixel_shuffle_impl(_info, reinterpret_cast(y), reinterpret_cast(x)); + break; + case INFINI_DTYPE_BF16: + pixel_shuffle_impl(_info, reinterpret_cast(y), reinterpret_cast(x)); + break; + case INFINI_DTYPE_F32: + pixel_shuffle_impl(_info, reinterpret_cast(y), reinterpret_cast(x)); + break; + case INFINI_DTYPE_F64: + pixel_shuffle_impl(_info, reinterpret_cast(y), reinterpret_cast(x)); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::pixel_shuffle::cpu diff --git a/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h new file mode 100644 index 000000000..d4a1a2b46 --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h @@ -0,0 +1,58 @@ +#ifndef __PIXEL_SHUFFLE_CPU_H__ +#define __PIXEL_SHUFFLE_CPU_H__ + +#include "../../../operator.h" +#include "../../../devices/cpu/common_cpu.h" +#include + +namespace op::pixel_shuffle::cpu { + +struct PixelShuffleInfo { + size_t batch; + size_t in_channels; + size_t out_channels; + size_t height; + size_t width; + int upscale_factor; + size_t input_size; + size_t output_size; + + static utils::Result create( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + int upscale_factor); +}; + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + PixelShuffleInfo _info; + + Descriptor(infiniDtype_t dtype, PixelShuffleInfo info, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::pixel_shuffle::cpu + +#endif // __PIXEL_SHUFFLE_CPU_H__ diff --git a/src/infiniop/ops/pixel_shuffle/cuda/kernel.cuh b/src/infiniop/ops/pixel_shuffle/cuda/kernel.cuh new file mode 100644 index 000000000..758b2ba7a --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/cuda/kernel.cuh @@ -0,0 +1,95 @@ +#pragma once +#include +#include +#include + +namespace op::cuda { + +template +__global__ void pixel_shuffle_kernel( + T *output, + const T *input, + size_t batch, + size_t out_channels, + size_t height, + size_t width, + int r) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = batch * out_channels * height * width; + + if (idx >= total) return; + + size_t n = idx / (out_channels * height * width); + size_t rem = idx % (out_channels * height * width); + size_t c = rem / (height * width); + rem = rem % (height * width); + size_t oh = rem / width; + size_t ow = rem % width; + + // Calculate input indices + size_t w = ow / r; + size_t h = oh / r; + size_t i = oh % r; + size_t j = ow % r; + size_t in_c = c * r * r + i * r + j; + + size_t in_idx = ((n * (out_channels * r * r) + in_c) * (height / r) + h) * (width / r) + w; + size_t out_idx = ((n * out_channels + c) * height + oh) * width + ow; + + output[out_idx] = input[in_idx]; +} + +template +__global__ void pixel_shuffle_kernel_strided( + T *output, + const T *input, + size_t batch, + size_t out_channels, + size_t out_height, + size_t out_width, + int r, + ptrdiff_t x_stride0, + ptrdiff_t x_stride1, + ptrdiff_t x_stride2, + ptrdiff_t x_stride3, + ptrdiff_t y_stride0, + ptrdiff_t y_stride1, + ptrdiff_t y_stride2, + ptrdiff_t y_stride3) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = batch * out_channels * out_height * out_width; + + if (idx >= total) return; + + const size_t spatial = out_height * out_width; + const size_t chw = out_channels * spatial; + + size_t n = idx / chw; + size_t rem = idx % chw; + size_t c = rem / spatial; + rem = rem % spatial; + size_t oh = rem / out_width; + size_t ow = rem % out_width; + + const size_t upscale = static_cast(r); + const size_t ih = oh / upscale; + const size_t iw = ow / upscale; + const size_t i = oh % upscale; + const size_t j = ow % upscale; + const size_t in_c = c * upscale * upscale + i * upscale + j; + + const ptrdiff_t in_offset = static_cast(n) * x_stride0 + + static_cast(in_c) * x_stride1 + + static_cast(ih) * x_stride2 + + static_cast(iw) * x_stride3; + const ptrdiff_t out_offset = static_cast(n) * y_stride0 + + static_cast(c) * y_stride1 + + static_cast(oh) * y_stride2 + + static_cast(ow) * y_stride3; + + output[out_offset] = input[in_offset]; +} + +} // namespace op::cuda diff --git a/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.h b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.h new file mode 100644 index 000000000..d64dbc961 --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.h @@ -0,0 +1,57 @@ +#ifndef __PIXEL_SHUFFLE_METAX_H__ +#define __PIXEL_SHUFFLE_METAX_H__ + +#include "../../../operator.h" +#include "../../../devices/metax/metax_common.h" + +namespace op::pixel_shuffle::metax { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t batch; + size_t in_channels; + size_t out_channels; + size_t height; + size_t width; + int upscale_factor; + size_t input_size; + size_t output_size; + + Descriptor(infiniDtype_t dtype, size_t batch, size_t in_channels, size_t out_channels, + size_t height, size_t width, int upscale_factor, + size_t input_size, size_t output_size, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + batch(batch), + in_channels(in_channels), + out_channels(out_channels), + height(height), + width(width), + upscale_factor(upscale_factor), + input_size(input_size), + output_size(output_size) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::pixel_shuffle::metax + +#endif // __PIXEL_SHUFFLE_METAX_H__ diff --git a/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.maca b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.maca new file mode 100644 index 000000000..4c9e5ca78 --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.maca @@ -0,0 +1,113 @@ +#include "pixel_shuffle_metax.h" +#include "../cuda/kernel.cuh" +#include "../../../utils.h" +#include "../../../devices/metax/metax_kernel_common.h" +#include +#include + +namespace op::pixel_shuffle::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES); + CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + + if (upscale_factor <= 0) { + return INFINI_STATUS_BAD_PARAM; + } + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 4) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t batch = x_shape[0]; + size_t in_channels = x_shape[1]; + size_t height = x_shape[2]; + size_t width = x_shape[3]; + + if (in_channels % (upscale_factor * upscale_factor) != 0) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t out_channels = in_channels / (upscale_factor * upscale_factor); + size_t out_height = height * upscale_factor; + size_t out_width = width * upscale_factor; + + std::vector expected_y_shape = {batch, out_channels, out_height, out_width}; + if (y_shape != expected_y_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + *desc_ptr = new Descriptor(dtype, batch, in_channels, out_channels, + height, width, upscale_factor, + x_desc->numel(), y_desc->numel(), + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + auto hc_stream = reinterpret_cast(stream); + constexpr int BLOCK_SIZE = 256; + size_t total = output_size; + if (total == 0) { + return INFINI_STATUS_SUCCESS; + } + int num_blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE; + + switch (_dtype) { + case INFINI_DTYPE_F16: + cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + case INFINI_DTYPE_BF16: + cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + case INFINI_DTYPE_F32: + cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + case INFINI_DTYPE_F64: + cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::pixel_shuffle::metax diff --git a/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.h b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.h new file mode 100644 index 000000000..db1a6db4c --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.h @@ -0,0 +1,57 @@ +#ifndef __PIXEL_SHUFFLE_MOORE_H__ +#define __PIXEL_SHUFFLE_MOORE_H__ + +#include "../../../operator.h" +#include "../../../devices/moore/moore_common.h" + +namespace op::pixel_shuffle::moore { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t batch; + size_t in_channels; + size_t out_channels; + size_t height; + size_t width; + int upscale_factor; + size_t input_size; + size_t output_size; + + Descriptor(infiniDtype_t dtype, size_t batch, size_t in_channels, size_t out_channels, + size_t height, size_t width, int upscale_factor, + size_t input_size, size_t output_size, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + batch(batch), + in_channels(in_channels), + out_channels(out_channels), + height(height), + width(width), + upscale_factor(upscale_factor), + input_size(input_size), + output_size(output_size) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::pixel_shuffle::moore + +#endif // __PIXEL_SHUFFLE_MOORE_H__ diff --git a/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.mu b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.mu new file mode 100644 index 000000000..6fb6f9ef2 --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.mu @@ -0,0 +1,113 @@ +#include "pixel_shuffle_moore.h" +#include "../cuda/kernel.cuh" +#include "../../../utils.h" +#include "../../../devices/moore/moore_kernel_common.h" +#include +#include + +namespace op::pixel_shuffle::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES); + CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + + if (upscale_factor <= 0) { + return INFINI_STATUS_BAD_PARAM; + } + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 4) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t batch = x_shape[0]; + size_t in_channels = x_shape[1]; + size_t height = x_shape[2]; + size_t width = x_shape[3]; + + if (in_channels % (upscale_factor * upscale_factor) != 0) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t out_channels = in_channels / (upscale_factor * upscale_factor); + size_t out_height = height * upscale_factor; + size_t out_width = width * upscale_factor; + + std::vector expected_y_shape = {batch, out_channels, out_height, out_width}; + if (y_shape != expected_y_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + *desc_ptr = new Descriptor(dtype, batch, in_channels, out_channels, + height, width, upscale_factor, + x_desc->numel(), y_desc->numel(), + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + auto musa_stream = reinterpret_cast(stream); + constexpr int BLOCK_SIZE = 256; + size_t total = output_size; + if (total == 0) { + return INFINI_STATUS_SUCCESS; + } + int num_blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE; + + switch (_dtype) { + case INFINI_DTYPE_F16: + cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + case INFINI_DTYPE_BF16: + cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + case INFINI_DTYPE_F32: + cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + case INFINI_DTYPE_F64: + cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::pixel_shuffle::moore diff --git a/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cu b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cu new file mode 100644 index 000000000..32b36e226 --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cu @@ -0,0 +1,129 @@ +#include "pixel_shuffle_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../../../utils.h" +#include +#include +#include + +namespace op::pixel_shuffle::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + if (y_desc->dtype() != dtype) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + if (upscale_factor <= 0) { + return INFINI_STATUS_BAD_PARAM; + } + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 4 || y_shape.size() != 4) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + if (x_desc->hasBroadcastDim() || y_desc->hasBroadcastDim()) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + size_t batch = x_shape[0]; + size_t in_channels = x_shape[1]; + size_t height = x_shape[2]; + size_t width = x_shape[3]; + + if (in_channels % (upscale_factor * upscale_factor) != 0) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t out_channels = in_channels / (upscale_factor * upscale_factor); + size_t out_height = height * upscale_factor; + size_t out_width = width * upscale_factor; + + std::vector expected_y_shape = {batch, out_channels, out_height, out_width}; + if (y_shape != expected_y_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + std::array x_strides = {x_desc->stride(0), x_desc->stride(1), x_desc->stride(2), x_desc->stride(3)}; + std::array y_strides = {y_desc->stride(0), y_desc->stride(1), y_desc->stride(2), y_desc->stride(3)}; + + *desc_ptr = new Descriptor(dtype, batch, in_channels, out_channels, + height, width, upscale_factor, + x_desc->numel(), y_desc->numel(), + x_strides, y_strides, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + auto cuda_stream = reinterpret_cast(stream); + constexpr int BLOCK_SIZE = 256; + const size_t out_height = height * static_cast(upscale_factor); + const size_t out_width = width * static_cast(upscale_factor); + const size_t total = output_size; + if (total == 0) { + return INFINI_STATUS_SUCCESS; + } + int num_blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE; + + switch (_dtype) { + case INFINI_DTYPE_F16: + cuda::pixel_shuffle_kernel_strided<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, out_height, out_width, + upscale_factor, + x_strides[0], x_strides[1], x_strides[2], x_strides[3], + y_strides[0], y_strides[1], y_strides[2], y_strides[3]); + break; + case INFINI_DTYPE_BF16: + cuda::pixel_shuffle_kernel_strided<__nv_bfloat16><<>>( + reinterpret_cast<__nv_bfloat16 *>(y), + reinterpret_cast(x), + batch, out_channels, out_height, out_width, + upscale_factor, + x_strides[0], x_strides[1], x_strides[2], x_strides[3], + y_strides[0], y_strides[1], y_strides[2], y_strides[3]); + break; + case INFINI_DTYPE_F32: + cuda::pixel_shuffle_kernel_strided<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, out_height, out_width, + upscale_factor, + x_strides[0], x_strides[1], x_strides[2], x_strides[3], + y_strides[0], y_strides[1], y_strides[2], y_strides[3]); + break; + case INFINI_DTYPE_F64: + cuda::pixel_shuffle_kernel_strided<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, out_height, out_width, + upscale_factor, + x_strides[0], x_strides[1], x_strides[2], x_strides[3], + y_strides[0], y_strides[1], y_strides[2], y_strides[3]); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::pixel_shuffle::nvidia diff --git a/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cuh b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cuh new file mode 100644 index 000000000..1cd155cab --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cuh @@ -0,0 +1,65 @@ +#ifndef __PIXEL_SHUFFLE_NVIDIA_H__ +#define __PIXEL_SHUFFLE_NVIDIA_H__ + +#include "../../../operator.h" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include +#include + +namespace op::pixel_shuffle::nvidia { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t batch; + size_t in_channels; + size_t out_channels; + size_t height; + size_t width; + int upscale_factor; + size_t input_size; + size_t output_size; + std::array x_strides; + std::array y_strides; + + Descriptor(infiniDtype_t dtype, size_t batch, size_t in_channels, size_t out_channels, + size_t height, size_t width, int upscale_factor, + size_t input_size, size_t output_size, + std::array x_strides, + std::array y_strides, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + batch(batch), + in_channels(in_channels), + out_channels(out_channels), + height(height), + width(width), + upscale_factor(upscale_factor), + input_size(input_size), + output_size(output_size), + x_strides(x_strides), + y_strides(y_strides) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::pixel_shuffle::nvidia + +#endif // __PIXEL_SHUFFLE_NVIDIA_H__ diff --git a/src/infiniop/ops/pixel_shuffle/operator.cc b/src/infiniop/ops/pixel_shuffle/operator.cc new file mode 100644 index 000000000..8147b0b2d --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/operator.cc @@ -0,0 +1,159 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/pixel_shuffle.h" + +#ifdef ENABLE_CPU_API +#include "cpu/pixel_shuffle_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/pixel_shuffle_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/pixel_shuffle_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/pixel_shuffle_moore.h" +#endif + +__C __export infiniStatus_t infiniopCreatePixelShuffleDescriptor( + infiniopHandle_t handle, + infiniopPixelShuffleDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::pixel_shuffle::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ + upscale_factor) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C __export infiniStatus_t infiniopGetPixelShuffleWorkspaceSize(infiniopPixelShuffleDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C __export infiniStatus_t infiniopPixelShuffle( + infiniopPixelShuffleDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, x, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C __export infiniStatus_t +infiniopDestroyPixelShuffleDescriptor(infiniopPixelShuffleDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} From 2647cf913b4a462b99517fdd2f15ff9aa8b11bf7 Mon Sep 17 00:00:00 2001 From: PanZezhong Date: Tue, 7 Apr 2026 16:57:04 +0800 Subject: [PATCH 2/2] issue/1031 fix T1-1-41 --- include/infiniop/ops/erf.h | 24 +-- include/infiniop/ops/erfc.h | 24 +-- include/infiniop/ops/erfinv.h | 24 +-- include/infiniop/ops/matrix_power.h | 26 +-- include/infiniop/ops/pixel_shuffle.h | 26 +-- src/infiniop/ops/erf/cuda/kernel.cuh | 5 +- src/infiniop/ops/erf/moore/erf_moore.mu | 10 +- src/infiniop/ops/erf/moore/erf_moore_kernel.h | 36 ---- src/infiniop/ops/erf/operator.cc | 34 ++-- src/infiniop/ops/erfc/cuda/kernel.cuh | 5 +- src/infiniop/ops/erfc/moore/erfc_moore.mu | 10 +- .../ops/erfc/moore/erfc_moore_kernel.h | 36 ---- src/infiniop/ops/erfc/operator.cc | 30 ++-- src/infiniop/ops/erfinv/cpu/erfinv_cpu.h | 22 ++- src/infiniop/ops/erfinv/cuda/kernel.cuh | 85 +++++---- src/infiniop/ops/erfinv/moore/erfinv_moore.mu | 10 +- .../ops/erfinv/moore/erfinv_moore_kernel.h | 72 -------- src/infiniop/ops/erfinv/operator.cc | 14 +- .../ops/matrix_power/cpu/matrix_power_cpu.cc | 4 +- .../ops/matrix_power/cpu/matrix_power_cpu.h | 6 +- .../matrix_power/metax/matrix_power_metax.h | 1 - .../metax/matrix_power_metax.maca | 8 +- .../matrix_power/moore/matrix_power_moore.h | 1 - .../matrix_power/moore/matrix_power_moore.mu | 8 +- .../nvidia/matrix_power_nvidia.cu | 6 +- .../nvidia/matrix_power_nvidia.cuh | 1 - src/infiniop/ops/matrix_power/operator.cc | 40 ++--- .../pixel_shuffle/cpu/pixel_shuffle_cpu.cc | 2 +- .../ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h | 2 +- .../ops/pixel_shuffle/cuda/kernel.cuh | 19 +- .../pixel_shuffle/metax/pixel_shuffle_metax.h | 1 - .../metax/pixel_shuffle_metax.maca | 9 +- .../pixel_shuffle/moore/pixel_shuffle_moore.h | 1 - .../moore/pixel_shuffle_moore.mu | 9 +- .../nvidia/pixel_shuffle_nvidia.cu | 8 +- .../nvidia/pixel_shuffle_nvidia.cuh | 1 - src/infiniop/ops/pixel_shuffle/operator.cc | 38 ++-- test/infiniop/erf.py | 118 +++++++++++++ test/infiniop/erfc.py | 117 +++++++++++++ test/infiniop/erfinv.py | 118 +++++++++++++ test/infiniop/libinfiniop/op_register.py | 162 ++++++++++++++++++ test/infiniop/matrix_power.py | 104 +++++++++++ test/infiniop/pixel_shuffle.py | 114 ++++++++++++ 43 files changed, 993 insertions(+), 398 deletions(-) delete mode 100644 src/infiniop/ops/erf/moore/erf_moore_kernel.h delete mode 100644 src/infiniop/ops/erfc/moore/erfc_moore_kernel.h delete mode 100644 src/infiniop/ops/erfinv/moore/erfinv_moore_kernel.h create mode 100644 test/infiniop/erf.py create mode 100644 test/infiniop/erfc.py create mode 100644 test/infiniop/erfinv.py create mode 100644 test/infiniop/matrix_power.py create mode 100644 test/infiniop/pixel_shuffle.py diff --git a/include/infiniop/ops/erf.h b/include/infiniop/ops/erf.h index 8cbb8fb74..8786b31a3 100644 --- a/include/infiniop/ops/erf.h +++ b/include/infiniop/ops/erf.h @@ -5,20 +5,20 @@ typedef struct InfiniopDescriptor *infiniopErfDescriptor_t; -__C __export infiniStatus_t infiniopCreateErfDescriptor(infiniopHandle_t handle, - infiniopErfDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); +__INFINI_C __export infiniStatus_t infiniopCreateErfDescriptor(infiniopHandle_t handle, + infiniopErfDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); -__C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopErf(infiniopErfDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); +__INFINI_C __export infiniStatus_t infiniopErf(infiniopErfDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); -__C __export infiniStatus_t infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc); #endif diff --git a/include/infiniop/ops/erfc.h b/include/infiniop/ops/erfc.h index 6454573bc..6bceebe35 100644 --- a/include/infiniop/ops/erfc.h +++ b/include/infiniop/ops/erfc.h @@ -5,20 +5,20 @@ typedef struct InfiniopDescriptor *infiniopErfcDescriptor_t; -__C __export infiniStatus_t infiniopCreateErfcDescriptor(infiniopHandle_t handle, - infiniopErfcDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); +__INFINI_C __export infiniStatus_t infiniopCreateErfcDescriptor(infiniopHandle_t handle, + infiniopErfcDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); -__C __export infiniStatus_t infiniopGetErfcWorkspaceSize(infiniopErfcDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetErfcWorkspaceSize(infiniopErfcDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopErfc(infiniopErfcDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); +__INFINI_C __export infiniStatus_t infiniopErfc(infiniopErfcDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); -__C __export infiniStatus_t infiniopDestroyErfcDescriptor(infiniopErfcDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyErfcDescriptor(infiniopErfcDescriptor_t desc); #endif diff --git a/include/infiniop/ops/erfinv.h b/include/infiniop/ops/erfinv.h index 79bc09f22..b14975253 100644 --- a/include/infiniop/ops/erfinv.h +++ b/include/infiniop/ops/erfinv.h @@ -5,20 +5,20 @@ typedef struct InfiniopDescriptor *infiniopErfinvDescriptor_t; -__C __export infiniStatus_t infiniopCreateErfinvDescriptor(infiniopHandle_t handle, - infiniopErfinvDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); +__INFINI_C __export infiniStatus_t infiniopCreateErfinvDescriptor(infiniopHandle_t handle, + infiniopErfinvDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); -__C __export infiniStatus_t infiniopGetErfinvWorkspaceSize(infiniopErfinvDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetErfinvWorkspaceSize(infiniopErfinvDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopErfinv(infiniopErfinvDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); +__INFINI_C __export infiniStatus_t infiniopErfinv(infiniopErfinvDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); -__C __export infiniStatus_t infiniopDestroyErfinvDescriptor(infiniopErfinvDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyErfinvDescriptor(infiniopErfinvDescriptor_t desc); #endif diff --git a/include/infiniop/ops/matrix_power.h b/include/infiniop/ops/matrix_power.h index acd7c0c7e..639d3cf02 100644 --- a/include/infiniop/ops/matrix_power.h +++ b/include/infiniop/ops/matrix_power.h @@ -5,21 +5,21 @@ typedef struct InfiniopDescriptor *infiniopMatrixPowerDescriptor_t; -__C __export infiniStatus_t infiniopCreateMatrixPowerDescriptor(infiniopHandle_t handle, - infiniopMatrixPowerDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x, - int n); +__INFINI_C __export infiniStatus_t infiniopCreateMatrixPowerDescriptor(infiniopHandle_t handle, + infiniopMatrixPowerDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int n); -__C __export infiniStatus_t infiniopGetMatrixPowerWorkspaceSize(infiniopMatrixPowerDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetMatrixPowerWorkspaceSize(infiniopMatrixPowerDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopMatrixPower(infiniopMatrixPowerDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); +__INFINI_C __export infiniStatus_t infiniopMatrixPower(infiniopMatrixPowerDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); -__C __export infiniStatus_t infiniopDestroyMatrixPowerDescriptor(infiniopMatrixPowerDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyMatrixPowerDescriptor(infiniopMatrixPowerDescriptor_t desc); #endif diff --git a/include/infiniop/ops/pixel_shuffle.h b/include/infiniop/ops/pixel_shuffle.h index 941a91cfc..fac8eff2c 100644 --- a/include/infiniop/ops/pixel_shuffle.h +++ b/include/infiniop/ops/pixel_shuffle.h @@ -5,21 +5,21 @@ typedef struct InfiniopDescriptor *infiniopPixelShuffleDescriptor_t; -__C __export infiniStatus_t infiniopCreatePixelShuffleDescriptor(infiniopHandle_t handle, - infiniopPixelShuffleDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x, - int upscale_factor); +__INFINI_C __export infiniStatus_t infiniopCreatePixelShuffleDescriptor(infiniopHandle_t handle, + infiniopPixelShuffleDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int upscale_factor); -__C __export infiniStatus_t infiniopGetPixelShuffleWorkspaceSize(infiniopPixelShuffleDescriptor_t desc, size_t *size); +__INFINI_C __export infiniStatus_t infiniopGetPixelShuffleWorkspaceSize(infiniopPixelShuffleDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopPixelShuffle(infiniopPixelShuffleDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); +__INFINI_C __export infiniStatus_t infiniopPixelShuffle(infiniopPixelShuffleDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); -__C __export infiniStatus_t infiniopDestroyPixelShuffleDescriptor(infiniopPixelShuffleDescriptor_t desc); +__INFINI_C __export infiniStatus_t infiniopDestroyPixelShuffleDescriptor(infiniopPixelShuffleDescriptor_t desc); #endif diff --git a/src/infiniop/ops/erf/cuda/kernel.cuh b/src/infiniop/ops/erf/cuda/kernel.cuh index 9bd6cff21..8efe037e3 100644 --- a/src/infiniop/ops/erf/cuda/kernel.cuh +++ b/src/infiniop/ops/erf/cuda/kernel.cuh @@ -1,8 +1,5 @@ #pragma once #include -#include -#include -#include #include namespace op::cuda { @@ -15,7 +12,7 @@ struct ErfOp { if constexpr (std::is_same_v) { return erff(x); } else if constexpr (std::is_same_v) { - return erf(x); + return ::erf(x); } else { // For F16/BF16: promote to float, compute, then cast back float xf; diff --git a/src/infiniop/ops/erf/moore/erf_moore.mu b/src/infiniop/ops/erf/moore/erf_moore.mu index 1f717fa51..856764c70 100644 --- a/src/infiniop/ops/erf/moore/erf_moore.mu +++ b/src/infiniop/ops/erf/moore/erf_moore.mu @@ -2,7 +2,7 @@ #include "../../../elementwise/moore/elementwise_moore.h" -#include "erf_moore_kernel.h" +#include "../cuda/kernel.cuh" namespace op::erf::moore { @@ -43,13 +43,13 @@ infiniStatus_t Descriptor::calculate( switch (_dtype) { case INFINI_DTYPE_BF16: - return _device_info->calculate<256, moore::ErfOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::ErfOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F16: - return _device_info->calculate<256, moore::ErfOp, half>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::ErfOp, half>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F32: - return _device_info->calculate<256, moore::ErfOp, float>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::ErfOp, float>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F64: - return _device_info->calculate<256, moore::ErfOp, double>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::ErfOp, double>(_info, workspace, output, inputs, stream); default: return INFINI_STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/infiniop/ops/erf/moore/erf_moore_kernel.h b/src/infiniop/ops/erf/moore/erf_moore_kernel.h deleted file mode 100644 index 8ddc9d5f1..000000000 --- a/src/infiniop/ops/erf/moore/erf_moore_kernel.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef __ERF_MOORE_KERNEL_H__ -#define __ERF_MOORE_KERNEL_H__ - -#include -#include -#include -#include - -namespace op::erf::moore { - -typedef struct ErfOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - float x0 = __low2float(x); - float x1 = __high2float(x); - return __floats2half2_rn(erff(x0), erff(x1)); - } else if constexpr (std::is_same_v) { - float xf = __half2float(x); - return __float2half(erff(xf)); - } else if constexpr (std::is_same_v) { - float xf = __bfloat162float(x); - return __float2bfloat16_rn(erff(xf)); - } else if constexpr (std::is_same_v) { - return erff(x); - } else { // double - return erf(x); - } - } -} ErfOp; - -} // namespace op::erf::moore - -#endif // __ERF_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/erf/operator.cc b/src/infiniop/ops/erf/operator.cc index f9b61e981..7ed264e33 100644 --- a/src/infiniop/ops/erf/operator.cc +++ b/src/infiniop/ops/erf/operator.cc @@ -15,18 +15,18 @@ #include "moore/erf_moore.h" #endif -__C __export infiniStatus_t infiniopCreateErfDescriptor( +__INFINI_C __export infiniStatus_t infiniopCreateErfDescriptor( infiniopHandle_t handle, infiniopErfDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::erf::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::erf::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ {x_desc}) switch (handle->device) { @@ -54,10 +54,10 @@ __C __export infiniStatus_t infiniopCreateErfDescriptor( #undef CREATE } -__C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size) { +__INFINI_C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; @@ -85,7 +85,7 @@ __C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C __export infiniStatus_t infiniopErf( +__INFINI_C __export infiniStatus_t infiniopErf( infiniopErfDescriptor_t desc, void *workspace, size_t workspace_size, @@ -93,9 +93,9 @@ __C __export infiniStatus_t infiniopErf( const void *x, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ ->calculate(workspace, workspace_size, y, {x}, stream) switch (desc->device_type) { @@ -123,11 +123,11 @@ __C __export infiniStatus_t infiniopErf( #undef CALCULATE } -__C __export infiniStatus_t +__INFINI_C __export infiniStatus_t infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS; diff --git a/src/infiniop/ops/erfc/cuda/kernel.cuh b/src/infiniop/ops/erfc/cuda/kernel.cuh index aae8efcee..5a448919d 100644 --- a/src/infiniop/ops/erfc/cuda/kernel.cuh +++ b/src/infiniop/ops/erfc/cuda/kernel.cuh @@ -1,8 +1,5 @@ #pragma once #include -#include -#include -#include #include namespace op::cuda { @@ -15,7 +12,7 @@ struct ErfcOp { if constexpr (std::is_same_v) { return erfcf(x); } else if constexpr (std::is_same_v) { - return erfc(x); + return ::erfc(x); } else { // For F16/BF16: promote to float, compute, then cast back float xf; diff --git a/src/infiniop/ops/erfc/moore/erfc_moore.mu b/src/infiniop/ops/erfc/moore/erfc_moore.mu index d1eaec1bf..d58243e50 100644 --- a/src/infiniop/ops/erfc/moore/erfc_moore.mu +++ b/src/infiniop/ops/erfc/moore/erfc_moore.mu @@ -2,7 +2,7 @@ #include "../../../elementwise/moore/elementwise_moore.h" -#include "erfc_moore_kernel.h" +#include "../cuda/kernel.cuh" namespace op::erfc::moore { @@ -43,13 +43,13 @@ infiniStatus_t Descriptor::calculate( switch (_dtype) { case INFINI_DTYPE_BF16: - return _device_info->calculate<256, moore::ErfcOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::ErfcOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F16: - return _device_info->calculate<256, moore::ErfcOp, half>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::ErfcOp, half>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F32: - return _device_info->calculate<256, moore::ErfcOp, float>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::ErfcOp, float>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F64: - return _device_info->calculate<256, moore::ErfcOp, double>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::ErfcOp, double>(_info, workspace, output, inputs, stream); default: return INFINI_STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/infiniop/ops/erfc/moore/erfc_moore_kernel.h b/src/infiniop/ops/erfc/moore/erfc_moore_kernel.h deleted file mode 100644 index cd5225c3b..000000000 --- a/src/infiniop/ops/erfc/moore/erfc_moore_kernel.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef __ERFC_MOORE_KERNEL_H__ -#define __ERFC_MOORE_KERNEL_H__ - -#include -#include -#include -#include - -namespace op::erfc::moore { - -typedef struct ErfcOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - float x0 = __low2float(x); - float x1 = __high2float(x); - return __floats2half2_rn(erfcf(x0), erfcf(x1)); - } else if constexpr (std::is_same_v) { - float xf = __half2float(x); - return __float2half(erfcf(xf)); - } else if constexpr (std::is_same_v) { - float xf = __bfloat162float(x); - return __float2bfloat16_rn(erfcf(xf)); - } else if constexpr (std::is_same_v) { - return erfcf(x); - } else { // double - return erfc(x); - } - } -} ErfcOp; - -} // namespace op::erfc::moore - -#endif // __ERFC_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/erfc/operator.cc b/src/infiniop/ops/erfc/operator.cc index fa102c90c..ebd692349 100644 --- a/src/infiniop/ops/erfc/operator.cc +++ b/src/infiniop/ops/erfc/operator.cc @@ -15,18 +15,18 @@ #include "moore/erfc_moore.h" #endif -__C __export infiniStatus_t infiniopCreateErfcDescriptor( +__INFINI_C __export infiniStatus_t infiniopCreateErfcDescriptor( infiniopHandle_t handle, infiniopErfcDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ return op::erfc::NAMESPACE::Descriptor::create( \ - handle, \ + handle, \ reinterpret_cast(desc_ptr), \ - y_desc, \ + y_desc, \ {x_desc}) switch (handle->device) { @@ -54,10 +54,10 @@ __C __export infiniStatus_t infiniopCreateErfcDescriptor( #undef CREATE } -__C __export infiniStatus_t infiniopGetErfcWorkspaceSize(infiniopErfcDescriptor_t desc, size_t *size) { +__INFINI_C __export infiniStatus_t infiniopGetErfcWorkspaceSize(infiniopErfcDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; @@ -85,7 +85,7 @@ __C __export infiniStatus_t infiniopGetErfcWorkspaceSize(infiniopErfcDescriptor_ return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C __export infiniStatus_t infiniopErfc( +__INFINI_C __export infiniStatus_t infiniopErfc( infiniopErfcDescriptor_t desc, void *workspace, size_t workspace_size, @@ -93,9 +93,9 @@ __C __export infiniStatus_t infiniopErfc( const void *x, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ ->calculate(workspace, workspace_size, y, {x}, stream) switch (desc->device_type) { @@ -123,11 +123,11 @@ __C __export infiniStatus_t infiniopErfc( #undef CALCULATE } -__C __export infiniStatus_t +__INFINI_C __export infiniStatus_t infiniopDestroyErfcDescriptor(infiniopErfcDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS; diff --git a/src/infiniop/ops/erfinv/cpu/erfinv_cpu.h b/src/infiniop/ops/erfinv/cpu/erfinv_cpu.h index 8cc218a1d..e7b1b723d 100644 --- a/src/infiniop/ops/erfinv/cpu/erfinv_cpu.h +++ b/src/infiniop/ops/erfinv/cpu/erfinv_cpu.h @@ -13,10 +13,18 @@ namespace op::erfinv::cpu { template T erfinv_impl(T x) { // Domain: x in (-1, 1) - if (x == 1.0) return std::numeric_limits::infinity(); - if (x == -1.0) return -std::numeric_limits::infinity(); - if (x > 1.0 || x < -1.0) return std::numeric_limits::quiet_NaN(); - if (x == 0.0) return 0.0; + if (x == 1.0) { + return std::numeric_limits::infinity(); + } + if (x == -1.0) { + return -std::numeric_limits::infinity(); + } + if (x > 1.0 || x < -1.0) { + return std::numeric_limits::quiet_NaN(); + } + if (x == 0.0) { + return 0.0; + } // Use Newton's method to solve erf(y) = x T y = x; // Initial guess @@ -25,9 +33,11 @@ T erfinv_impl(T x) { for (int i = 0; i < max_iter; ++i) { T erf_y = std::erf(y); - T derf_dy = 2.0 / std::sqrt(3.14159265358979323846) * std::exp(-y * y); + T derf_dy = T(2.0) / T(std::sqrt(3.14159265358979323846) * std::exp(-y * y)); T error = erf_y - x; - if (std::abs(error) < tol) break; + if (std::abs(error) < tol) { + break; + } y = y - error / derf_dy; } return y; diff --git a/src/infiniop/ops/erfinv/cuda/kernel.cuh b/src/infiniop/ops/erfinv/cuda/kernel.cuh index 4f4660e80..18025331a 100644 --- a/src/infiniop/ops/erfinv/cuda/kernel.cuh +++ b/src/infiniop/ops/erfinv/cuda/kernel.cuh @@ -1,37 +1,41 @@ #pragma once #include -#include -#include -#include -#include +#include #include namespace op::cuda { +constexpr float PI_F = 3.14159265358979323846f; +constexpr double PI = 3.14159265358979323846; + // Inverse error function. -// -// We use a Winitzki-style approximation for an initial guess, then refine with -// a few Newton iterations. Starting with y=x converges poorly for x close to 1, -// which appears frequently in test inputs (torch.rand in [0,1)). __device__ __forceinline__ float erfinv_impl(float x) { - if (x == 1.0f) return CUDART_INF_F; - if (x == -1.0f) return -CUDART_INF_F; - if (x > 1.0f || x < -1.0f) return CUDART_NAN_F; - if (x == 0.0f) return 0.0f; + if (x == 1.0f) { + return std::numeric_limits::infinity(); + } + if (x == -1.0f) { + return -std::numeric_limits::infinity(); + } + if (x > 1.0f || x < -1.0f) { + return std::numeric_limits::quiet_NaN(); + } + if (x == 0.0f) { + return 0.0f; + } - // Winitzki approximation (a = 0.147) for initial guess. - // See: https://arxiv.org/abs/math/0306301 (and common implementations). + // Winitzki approximation const float a = 0.147f; - const float ln = log1pf(-x * x); // ln(1 - x^2) <= 0 - const float t = 2.0f / (CUDART_PI_F * a) + ln * 0.5f; + const float ln = log1pf(-x * x); + const float t = 2.0f / (PI_F * a) + ln * 0.5f; + float inside = t * t - ln / a; inside = inside > 0.0f ? inside : 0.0f; + float y0 = copysignf(sqrtf(sqrtf(inside) - t), x); - // Fast path: a few Newton steps in float. - // This is sufficient for most x and much faster than always refining in double. float y = y0; - const float sqrt_pi_f = 1.7724538509055159f; // sqrt(pi) + const float sqrt_pi_f = sqrtf(PI_F); + #pragma unroll for (int i = 0; i < 4; ++i) { const float erf_y = erff(y); @@ -39,17 +43,13 @@ __device__ __forceinline__ float erfinv_impl(float x) { y = y - (erf_y - x) / derf_dy; } - // Hybrid slow path: only for values extremely close to ±1 where float erf - // quantization can cause Newton iterations to stagnate, leading to noticeable - // absolute error in y (even if erff(y) == x in float). - // - // The threshold is chosen so the slow path is taken very rarely for typical - // random inputs, minimizing warp divergence and preserving performance. + // Slow path near |x| ~ 1 const float ax = fabsf(x); if (1.0f - ax < 1e-4f) { const double xd = static_cast(x); double yd = static_cast(y); - const double sqrt_pi = 1.7724538509055159; // sqrt(pi) + const double sqrt_pi = sqrt(PI); + #pragma unroll for (int i = 0; i < 4; ++i) { const double erf_y = erf(yd); @@ -69,32 +69,47 @@ struct ErfinvOp { __device__ __forceinline__ T operator()(T x) const { if constexpr (std::is_same_v) { return erfinv_impl(x); + } else if constexpr (std::is_same_v) { - // For double, use similar approach - if (x == 1.0) return CUDART_INF; - if (x == -1.0) return -CUDART_INF; - if (x > 1.0 || x < -1.0) return CUDART_NAN; - if (x == 0.0) return 0.0; + if (x == 1.0) { + return std::numeric_limits::infinity(); + } + if (x == -1.0) { + return -std::numeric_limits::infinity(); + } + if (x > 1.0 || x < -1.0) { + return std::numeric_limits::quiet_NaN(); + } + if (x == 0.0) { + return 0.0; + } + const double a = 0.147; const double ln = log1p(-x * x); - const double t = 2.0 / (CUDART_PI * a) + ln * 0.5; + const double t = 2.0 / (PI * a) + ln * 0.5; + double inside = t * t - ln / a; inside = inside > 0.0 ? inside : 0.0; + double y = copysign(sqrt(sqrt(inside) - t), x); const int max_iter = 30; const double tol = 1e-14; - const double sqrt_pi = 1.7724538509055159; + const double sqrt_pi = sqrt(PI); + for (int i = 0; i < max_iter; ++i) { const double erf_y = erf(y); const double error = erf_y - x; - if (fabs(error) < tol) break; + if (fabs(error) < tol) { + break; + } const double derf_dy = 2.0 / sqrt_pi * exp(-y * y); y = y - error / derf_dy; } return y; + } else { - // For F16/BF16: promote to float, compute, then cast back + // F16 / BF16 / other types float xf; if constexpr (std::is_same_v) { xf = __half2float(x); diff --git a/src/infiniop/ops/erfinv/moore/erfinv_moore.mu b/src/infiniop/ops/erfinv/moore/erfinv_moore.mu index 54b5830ea..26febe69d 100644 --- a/src/infiniop/ops/erfinv/moore/erfinv_moore.mu +++ b/src/infiniop/ops/erfinv/moore/erfinv_moore.mu @@ -2,7 +2,7 @@ #include "../../../elementwise/moore/elementwise_moore.h" -#include "erfinv_moore_kernel.h" +#include "../cuda/kernel.cuh" namespace op::erfinv::moore { @@ -43,13 +43,13 @@ infiniStatus_t Descriptor::calculate( switch (_dtype) { case INFINI_DTYPE_BF16: - return _device_info->calculate<256, moore::ErfinvOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::ErfinvOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F16: - return _device_info->calculate<256, moore::ErfinvOp, half>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::ErfinvOp, half>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F32: - return _device_info->calculate<256, moore::ErfinvOp, float>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::ErfinvOp, float>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F64: - return _device_info->calculate<256, moore::ErfinvOp, double>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::ErfinvOp, double>(_info, workspace, output, inputs, stream); default: return INFINI_STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/infiniop/ops/erfinv/moore/erfinv_moore_kernel.h b/src/infiniop/ops/erfinv/moore/erfinv_moore_kernel.h deleted file mode 100644 index e3f3bb5f2..000000000 --- a/src/infiniop/ops/erfinv/moore/erfinv_moore_kernel.h +++ /dev/null @@ -1,72 +0,0 @@ -#ifndef __ERFINV_MOORE_KERNEL_H__ -#define __ERFINV_MOORE_KERNEL_H__ - -#include -#include -#include -#include - -namespace op::erfinv::moore { - -// Inverse error function using Newton's method -template -__device__ __forceinline__ T erfinv_impl(T x) { - if (x >= 1.0f) return CUDART_INF_F; - if (x <= -1.0f) return -CUDART_INF_F; - if (x == 0.0f) return 0.0f; - - T y = x; - const int max_iter = 10; - const T tol = 1e-10f; - const T sqrt_pi = 1.7724538509055159f; - - for (int i = 0; i < max_iter; ++i) { - T erf_y = erff(y); - T derf_dy = 2.0f / sqrt_pi * expf(-y * y); - T error = erf_y - x; - if (fabsf(error) < tol) break; - y = y - error / derf_dy; - } - return y; -} - -typedef struct ErfinvOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - float x0 = __low2float(x); - float x1 = __high2float(x); - return __floats2half2_rn(erfinv_impl(x0), erfinv_impl(x1)); - } else if constexpr (std::is_same_v) { - float xf = __half2float(x); - return __float2half(erfinv_impl(xf)); - } else if constexpr (std::is_same_v) { - float xf = __bfloat162float(x); - return __float2bfloat16_rn(erfinv_impl(xf)); - } else if constexpr (std::is_same_v) { - return erfinv_impl(x); - } else { // double - if (x >= 1.0) return CUDART_INF; - if (x <= -1.0) return -CUDART_INF; - if (x == 0.0) return 0.0; - double y = x; - const int max_iter = 10; - const double tol = 1e-10; - const double sqrt_pi = 1.7724538509055159; - for (int i = 0; i < max_iter; ++i) { - double erf_y = erf(y); - double derf_dy = 2.0 / sqrt_pi * exp(-y * y); - double error = erf_y - x; - if (fabs(error) < tol) break; - y = y - error / derf_dy; - } - return y; - } - } -} ErfinvOp; - -} // namespace op::erfinv::moore - -#endif // __ERFINV_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/erfinv/operator.cc b/src/infiniop/ops/erfinv/operator.cc index c7c360bec..c0453562b 100644 --- a/src/infiniop/ops/erfinv/operator.cc +++ b/src/infiniop/ops/erfinv/operator.cc @@ -15,7 +15,7 @@ #include "moore/erfinv_moore.h" #endif -__C __export infiniStatus_t infiniopCreateErfinvDescriptor( +__INFINI_C __export infiniStatus_t infiniopCreateErfinvDescriptor( infiniopHandle_t handle, infiniopErfinvDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc, @@ -23,7 +23,7 @@ __C __export infiniStatus_t infiniopCreateErfinvDescriptor( #define CREATE(CASE, NAMESPACE) \ case CASE: \ - return op::erfinv::NAMESPACE::Descriptor::create( \ + return op::erfinv::NAMESPACE::Descriptor::create( \ handle, \ reinterpret_cast(desc_ptr), \ y_desc, \ @@ -54,10 +54,10 @@ __C __export infiniStatus_t infiniopCreateErfinvDescriptor( #undef CREATE } -__C __export infiniStatus_t infiniopGetErfinvWorkspaceSize(infiniopErfinvDescriptor_t desc, size_t *size) { +__INFINI_C __export infiniStatus_t infiniopGetErfinvWorkspaceSize(infiniopErfinvDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; @@ -85,7 +85,7 @@ __C __export infiniStatus_t infiniopGetErfinvWorkspaceSize(infiniopErfinvDescrip return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C __export infiniStatus_t infiniopErfinv( +__INFINI_C __export infiniStatus_t infiniopErfinv( infiniopErfinvDescriptor_t desc, void *workspace, size_t workspace_size, @@ -123,7 +123,7 @@ __C __export infiniStatus_t infiniopErfinv( #undef CALCULATE } -__C __export infiniStatus_t +__INFINI_C __export infiniStatus_t infiniopDestroyErfinvDescriptor(infiniopErfinvDescriptor_t desc) { #define DELETE(CASE, NAMESPACE) \ diff --git a/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.cc b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.cc index 22a792012..6c2fe60cd 100644 --- a/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.cc +++ b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.cc @@ -1,7 +1,7 @@ #include "matrix_power_cpu.h" -#include "../../../utils.h" -#include +#include "../../../tensor.h" #include +#include namespace op::matrix_power::cpu { diff --git a/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.h b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.h index 9c6f2ebc1..06a14c341 100644 --- a/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.h +++ b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.h @@ -1,15 +1,15 @@ #ifndef __MATRIX_POWER_CPU_H__ #define __MATRIX_POWER_CPU_H__ -#include "../../../operator.h" #include "../../../devices/cpu/common_cpu.h" +#include "../../../operator.h" #include namespace op::matrix_power::cpu { struct MatrixPowerInfo { - size_t matrix_size; // N x N matrix - size_t n; // Power + size_t matrix_size; // N x N matrix + size_t n; // Power size_t input_size; size_t output_size; diff --git a/src/infiniop/ops/matrix_power/metax/matrix_power_metax.h b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.h index 02ff4b14d..43e561f66 100644 --- a/src/infiniop/ops/matrix_power/metax/matrix_power_metax.h +++ b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.h @@ -2,7 +2,6 @@ #define __MATRIX_POWER_METAX_H__ #include "../../../operator.h" -#include "../../../devices/metax/metax_common.h" namespace op::matrix_power::metax { diff --git a/src/infiniop/ops/matrix_power/metax/matrix_power_metax.maca b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.maca index 0e7f58f19..221757141 100644 --- a/src/infiniop/ops/matrix_power/metax/matrix_power_metax.maca +++ b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.maca @@ -1,9 +1,9 @@ -#include "matrix_power_metax.h" -#include "../../../utils.h" +#include "../../../devices/metax/metax_common.h" #include "../../../devices/metax/metax_kernel_common.h" -#include -#include +#include "../../../tensor.h" +#include "matrix_power_metax.h" #include +#include namespace op::matrix_power::metax { diff --git a/src/infiniop/ops/matrix_power/moore/matrix_power_moore.h b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.h index a58428a6e..42dd5c180 100644 --- a/src/infiniop/ops/matrix_power/moore/matrix_power_moore.h +++ b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.h @@ -2,7 +2,6 @@ #define __MATRIX_POWER_MOORE_H__ #include "../../../operator.h" -#include "../../../devices/moore/moore_common.h" namespace op::matrix_power::moore { diff --git a/src/infiniop/ops/matrix_power/moore/matrix_power_moore.mu b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.mu index 532480955..938ad5c0a 100644 --- a/src/infiniop/ops/matrix_power/moore/matrix_power_moore.mu +++ b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.mu @@ -1,9 +1,9 @@ -#include "matrix_power_moore.h" -#include "../../../utils.h" +#include "../../../devices/moore/moore_common.h" #include "../../../devices/moore/moore_kernel_common.h" -#include -#include +#include "../../../tensor.h" +#include "matrix_power_moore.h" #include +#include namespace op::matrix_power::moore { diff --git a/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cu b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cu index e6164c8ca..d728238c1 100644 --- a/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cu +++ b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cu @@ -1,9 +1,7 @@ -#include "matrix_power_nvidia.cuh" -#include "../../../utils.h" #include "../../../devices/nvidia/nvidia_handle.cuh" #include "../../../devices/nvidia/nvidia_kernel_common.cuh" -#include -#include +#include "../../../tensor.h" +#include "matrix_power_nvidia.cuh" #include #include diff --git a/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cuh b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cuh index ebe0ecbdf..d11c6914e 100644 --- a/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cuh +++ b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cuh @@ -2,7 +2,6 @@ #define __MATRIX_POWER_NVIDIA_H__ #include "../../../operator.h" -#include "../../../devices/nvidia/nvidia_common.cuh" #include namespace op::matrix_power::nvidia { diff --git a/src/infiniop/ops/matrix_power/operator.cc b/src/infiniop/ops/matrix_power/operator.cc index d26e26fd1..79a81077a 100644 --- a/src/infiniop/ops/matrix_power/operator.cc +++ b/src/infiniop/ops/matrix_power/operator.cc @@ -15,20 +15,20 @@ #include "moore/matrix_power_moore.h" #endif -__C __export infiniStatus_t infiniopCreateMatrixPowerDescriptor( +__INFINI_C __export infiniStatus_t infiniopCreateMatrixPowerDescriptor( infiniopHandle_t handle, infiniopMatrixPowerDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc, int n) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::matrix_power::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - x_desc, \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::matrix_power::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ n) switch (handle->device) { @@ -56,11 +56,11 @@ __C __export infiniStatus_t infiniopCreateMatrixPowerDescriptor( #undef CREATE } -__C __export infiniStatus_t infiniopGetMatrixPowerWorkspaceSize(infiniopMatrixPowerDescriptor_t desc, size_t *size) { +__INFINI_C __export infiniStatus_t infiniopGetMatrixPowerWorkspaceSize(infiniopMatrixPowerDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; switch (desc->device_type) { @@ -87,7 +87,7 @@ __C __export infiniStatus_t infiniopGetMatrixPowerWorkspaceSize(infiniopMatrixPo return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C __export infiniStatus_t infiniopMatrixPower( +__INFINI_C __export infiniStatus_t infiniopMatrixPower( infiniopMatrixPowerDescriptor_t desc, void *workspace, size_t workspace_size, @@ -95,9 +95,9 @@ __C __export infiniStatus_t infiniopMatrixPower( const void *x, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ ->calculate(workspace, workspace_size, y, x, stream) switch (desc->device_type) { @@ -125,12 +125,12 @@ __C __export infiniStatus_t infiniopMatrixPower( #undef CALCULATE } -__C __export infiniStatus_t +__INFINI_C __export infiniStatus_t infiniopDestroyMatrixPowerDescriptor(infiniopMatrixPowerDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS; switch (desc->device_type) { diff --git a/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.cc b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.cc index 19c1de74b..15d0d6906 100644 --- a/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.cc +++ b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.cc @@ -1,5 +1,5 @@ #include "pixel_shuffle_cpu.h" -#include "../../../utils.h" +#include "../../../tensor.h" namespace op::pixel_shuffle::cpu { diff --git a/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h index d4a1a2b46..2fa5e00fe 100644 --- a/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h +++ b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h @@ -1,8 +1,8 @@ #ifndef __PIXEL_SHUFFLE_CPU_H__ #define __PIXEL_SHUFFLE_CPU_H__ -#include "../../../operator.h" #include "../../../devices/cpu/common_cpu.h" +#include "../../../operator.h" #include namespace op::pixel_shuffle::cpu { diff --git a/src/infiniop/ops/pixel_shuffle/cuda/kernel.cuh b/src/infiniop/ops/pixel_shuffle/cuda/kernel.cuh index 758b2ba7a..874b29cce 100644 --- a/src/infiniop/ops/pixel_shuffle/cuda/kernel.cuh +++ b/src/infiniop/ops/pixel_shuffle/cuda/kernel.cuh @@ -1,6 +1,5 @@ #pragma once #include -#include #include namespace op::cuda { @@ -18,7 +17,9 @@ __global__ void pixel_shuffle_kernel( size_t idx = blockIdx.x * blockDim.x + threadIdx.x; size_t total = batch * out_channels * height * width; - if (idx >= total) return; + if (idx >= total) { + return; + } size_t n = idx / (out_channels * height * width); size_t rem = idx % (out_channels * height * width); @@ -61,7 +62,9 @@ __global__ void pixel_shuffle_kernel_strided( size_t idx = blockIdx.x * blockDim.x + threadIdx.x; size_t total = batch * out_channels * out_height * out_width; - if (idx >= total) return; + if (idx >= total) { + return; + } const size_t spatial = out_height * out_width; const size_t chw = out_channels * spatial; @@ -80,14 +83,8 @@ __global__ void pixel_shuffle_kernel_strided( const size_t j = ow % upscale; const size_t in_c = c * upscale * upscale + i * upscale + j; - const ptrdiff_t in_offset = static_cast(n) * x_stride0 + - static_cast(in_c) * x_stride1 + - static_cast(ih) * x_stride2 + - static_cast(iw) * x_stride3; - const ptrdiff_t out_offset = static_cast(n) * y_stride0 + - static_cast(c) * y_stride1 + - static_cast(oh) * y_stride2 + - static_cast(ow) * y_stride3; + const ptrdiff_t in_offset = static_cast(n) * x_stride0 + static_cast(in_c) * x_stride1 + static_cast(ih) * x_stride2 + static_cast(iw) * x_stride3; + const ptrdiff_t out_offset = static_cast(n) * y_stride0 + static_cast(c) * y_stride1 + static_cast(oh) * y_stride2 + static_cast(ow) * y_stride3; output[out_offset] = input[in_offset]; } diff --git a/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.h b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.h index d64dbc961..34f216655 100644 --- a/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.h +++ b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.h @@ -2,7 +2,6 @@ #define __PIXEL_SHUFFLE_METAX_H__ #include "../../../operator.h" -#include "../../../devices/metax/metax_common.h" namespace op::pixel_shuffle::metax { diff --git a/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.maca b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.maca index 4c9e5ca78..3c8afd6f2 100644 --- a/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.maca +++ b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.maca @@ -1,9 +1,8 @@ -#include "pixel_shuffle_metax.h" -#include "../cuda/kernel.cuh" -#include "../../../utils.h" +#include "../../../devices/metax/metax_common.h" #include "../../../devices/metax/metax_kernel_common.h" -#include -#include +#include "../../../tensor.h" +#include "../cuda/kernel.cuh" +#include "pixel_shuffle_metax.h" namespace op::pixel_shuffle::metax { diff --git a/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.h b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.h index db1a6db4c..e47a3edb2 100644 --- a/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.h +++ b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.h @@ -2,7 +2,6 @@ #define __PIXEL_SHUFFLE_MOORE_H__ #include "../../../operator.h" -#include "../../../devices/moore/moore_common.h" namespace op::pixel_shuffle::moore { diff --git a/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.mu b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.mu index 6fb6f9ef2..5d2c113d6 100644 --- a/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.mu +++ b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.mu @@ -1,9 +1,8 @@ -#include "pixel_shuffle_moore.h" -#include "../cuda/kernel.cuh" -#include "../../../utils.h" +#include "../../../devices/moore/moore_common.h" #include "../../../devices/moore/moore_kernel_common.h" -#include -#include +#include "../../../tensor.h" +#include "../cuda/kernel.cuh" +#include "pixel_shuffle_moore.h" namespace op::pixel_shuffle::moore { diff --git a/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cu b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cu index 32b36e226..af2d006ae 100644 --- a/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cu +++ b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cu @@ -1,8 +1,8 @@ -#include "pixel_shuffle_nvidia.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../../../tensor.h" #include "../cuda/kernel.cuh" -#include "../../../utils.h" -#include -#include +#include "pixel_shuffle_nvidia.cuh" #include namespace op::pixel_shuffle::nvidia { diff --git a/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cuh b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cuh index 1cd155cab..a5c3b7506 100644 --- a/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cuh +++ b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cuh @@ -2,7 +2,6 @@ #define __PIXEL_SHUFFLE_NVIDIA_H__ #include "../../../operator.h" -#include "../../../devices/nvidia/nvidia_common.cuh" #include #include diff --git a/src/infiniop/ops/pixel_shuffle/operator.cc b/src/infiniop/ops/pixel_shuffle/operator.cc index 8147b0b2d..3092787ad 100644 --- a/src/infiniop/ops/pixel_shuffle/operator.cc +++ b/src/infiniop/ops/pixel_shuffle/operator.cc @@ -15,20 +15,20 @@ #include "moore/pixel_shuffle_moore.h" #endif -__C __export infiniStatus_t infiniopCreatePixelShuffleDescriptor( +__INFINI_C __export infiniStatus_t infiniopCreatePixelShuffleDescriptor( infiniopHandle_t handle, infiniopPixelShuffleDescriptor_t *desc_ptr, infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc, int upscale_factor) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::pixel_shuffle::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - x_desc, \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::pixel_shuffle::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ upscale_factor) switch (handle->device) { @@ -56,10 +56,10 @@ __C __export infiniStatus_t infiniopCreatePixelShuffleDescriptor( #undef CREATE } -__C __export infiniStatus_t infiniopGetPixelShuffleWorkspaceSize(infiniopPixelShuffleDescriptor_t desc, size_t *size) { +__INFINI_C __export infiniStatus_t infiniopGetPixelShuffleWorkspaceSize(infiniopPixelShuffleDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; @@ -87,7 +87,7 @@ __C __export infiniStatus_t infiniopGetPixelShuffleWorkspaceSize(infiniopPixelSh return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C __export infiniStatus_t infiniopPixelShuffle( +__INFINI_C __export infiniStatus_t infiniopPixelShuffle( infiniopPixelShuffleDescriptor_t desc, void *workspace, size_t workspace_size, @@ -95,9 +95,9 @@ __C __export infiniStatus_t infiniopPixelShuffle( const void *x, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ ->calculate(workspace, workspace_size, y, x, stream) switch (desc->device_type) { @@ -125,12 +125,12 @@ __C __export infiniStatus_t infiniopPixelShuffle( #undef CALCULATE } -__C __export infiniStatus_t +__INFINI_C __export infiniStatus_t infiniopDestroyPixelShuffleDescriptor(infiniopPixelShuffleDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS; switch (desc->device_type) { diff --git a/test/infiniop/erf.py b/test/infiniop/erf.py new file mode 100644 index 000000000..733376b97 --- /dev/null +++ b/test/infiniop/erf.py @@ -0,0 +1,118 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import * +from enum import Enum, auto +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +_TEST_CASES_ = [ + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4, 4), None, None), +] + +_TEST_CASES = _TEST_CASES_ +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, +} + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + dtype=torch.float16, + sync=None, +): + print( + f"Testing Erf on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + input = TestTensor(shape, input_stride, dtype, device) + output = TestTensor(shape, output_stride, dtype, device) + + output.update_torch_tensor(torch.erf(input.torch_tensor())) + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateErfDescriptor( + handle, ctypes.byref(descriptor), output.descriptor, input.descriptor + ) + ) + + input.destroy_desc() + output.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetErfWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_f(): + check_error( + LIBINFINIOP.infiniopErf( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_f() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose( + output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.erf(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_f(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyErfDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/erfc.py b/test/infiniop/erfc.py new file mode 100644 index 000000000..b2424496d --- /dev/null +++ b/test/infiniop/erfc.py @@ -0,0 +1,117 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import * +from enum import Enum, auto +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +_TEST_CASES_ = [ + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4, 4), None, None), +] + +_TEST_CASES = _TEST_CASES_ +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, +} + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + dtype=torch.float16, + sync=None, +): + print( + f"Testing Erfc on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + input = TestTensor(shape, input_stride, dtype, device) + output = TestTensor(shape, output_stride, dtype, device) + + output.update_torch_tensor(torch.erfc(input.torch_tensor())) + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateErfcDescriptor( + handle, ctypes.byref(descriptor), output.descriptor, input.descriptor + ) + ) + + input.destroy_desc() + output.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetErfcWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_f(): + check_error( + LIBINFINIOP.infiniopErfc( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_f() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose( + output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol + ) + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.erfc(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_f(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyErfcDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/erfinv.py b/test/infiniop/erfinv.py new file mode 100644 index 000000000..1979ee9c1 --- /dev/null +++ b/test/infiniop/erfinv.py @@ -0,0 +1,118 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import * +from enum import Enum, auto +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +_TEST_CASES_ = [ + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4, 4), None, None), +] + +_TEST_CASES = _TEST_CASES_ +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, +} + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + dtype=torch.float16, + sync=None, +): + print( + f"Testing Erfinv on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + input = TestTensor(shape, input_stride, dtype, device, scale=1.8, bias=-0.9) + + output = TestTensor(shape, output_stride, dtype, device) + output.update_torch_tensor(torch.erfinv(input.torch_tensor())) + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateErfinvDescriptor( + handle, ctypes.byref(descriptor), output.descriptor, input.descriptor + ) + ) + + input.destroy_desc() + output.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetErfinvWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_f(): + check_error( + LIBINFINIOP.infiniopErfinv( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_f() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose( + output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.erfinv(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_f(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyErfinvDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index cd3ad1b82..3675ca8d1 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -1753,3 +1753,165 @@ def silu_and_mul(lib): lib.infiniopDestroySiluAndMulDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] + + +@OpRegister.operator +def erf(lib): + lib.infiniopCreateErfDescriptor.restype = c_int32 + lib.infiniopCreateErfDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetErfWorkspaceSize.restype = c_int32 + lib.infiniopGetErfWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopErf.restype = c_int32 + lib.infiniopErf.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyErfDescriptor.restype = c_int32 + lib.infiniopDestroyErfDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def erfc(lib): + lib.infiniopCreateErfcDescriptor.restype = c_int32 + lib.infiniopCreateErfcDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetErfcWorkspaceSize.restype = c_int32 + lib.infiniopGetErfcWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopErfc.restype = c_int32 + lib.infiniopErfc.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyErfcDescriptor.restype = c_int32 + lib.infiniopDestroyErfcDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def erfinv(lib): + lib.infiniopCreateErfinvDescriptor.restype = c_int32 + lib.infiniopCreateErfinvDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetErfinvWorkspaceSize.restype = c_int32 + lib.infiniopGetErfinvWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopErfinv.restype = c_int32 + lib.infiniopErfinv.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyErfinvDescriptor.restype = c_int32 + lib.infiniopDestroyErfinvDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def matrix_power(lib): + lib.infiniopCreateMatrixPowerDescriptor.restype = c_int32 + lib.infiniopCreateMatrixPowerDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_int32, + ] + + lib.infiniopGetMatrixPowerWorkspaceSize.restype = c_int32 + lib.infiniopGetMatrixPowerWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMatrixPower.restype = c_int32 + lib.infiniopMatrixPower.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyMatrixPowerDescriptor.restype = c_int32 + lib.infiniopDestroyMatrixPowerDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def pixel_shuffle(lib): + lib.infiniopCreatePixelShuffleDescriptor.restype = c_int32 + lib.infiniopCreatePixelShuffleDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_int32, + ] + + lib.infiniopGetPixelShuffleWorkspaceSize.restype = c_int32 + lib.infiniopGetPixelShuffleWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopPixelShuffle.restype = c_int32 + lib.infiniopPixelShuffle.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyPixelShuffleDescriptor.restype = c_int32 + lib.infiniopDestroyPixelShuffleDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] diff --git a/test/infiniop/matrix_power.py b/test/infiniop/matrix_power.py new file mode 100644 index 000000000..7527fe249 --- /dev/null +++ b/test/infiniop/matrix_power.py @@ -0,0 +1,104 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +_TEST_CASES_ = [ + ((4, 4), None, None), + ((8, 8), None, None), +] + +_POWERS = [2, 3, 4] + +_TEST_CASES = [test_case + (n,) for test_case in _TEST_CASES_ for n in _POWERS] + +_TENSOR_DTYPES = [InfiniDtype.F32] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, +} + + +def test(handle, device, shape, input_stride, output_stride, n, dtype, sync=None): + print( + f"Testing MatrixPower on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} power:{n} " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + if shape[-1] != shape[-2]: + return + + input = TestTensor(shape, input_stride, dtype, device) + output = TestTensor(shape, output_stride, dtype, device) + + output.update_torch_tensor(torch.linalg.matrix_power(input.torch_tensor(), n)) + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateMatrixPowerDescriptor( + handle, ctypes.byref(descriptor), output.descriptor, input.descriptor, n + ) + ) + + input.destroy_desc() + output.destroy_desc() + + ws = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetMatrixPowerWorkspaceSize(descriptor, ctypes.byref(ws)) + ) + workspace = TestWorkspace(ws.value, device) + + def lib_f(): + check_error( + LIBINFINIOP.infiniopMatrixPower( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_f() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + assert torch.allclose( + output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.linalg.matrix_power(input.torch_tensor(), n), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_f(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyMatrixPowerDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/pixel_shuffle.py b/test/infiniop/pixel_shuffle.py new file mode 100644 index 000000000..2fb690fe5 --- /dev/null +++ b/test/infiniop/pixel_shuffle.py @@ -0,0 +1,114 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + + +_TEST_CASES = [ + ((1, 4, 8, 8), None, None, 2), + ((2, 16, 4, 4), None, None, 2), + ((2, 16, 4, 4), None, None, 4), +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, +} + + +def test(handle, device, shape, input_stride, output_stride, upscale, dtype, sync=None): + print( + f"Testing PixelShuffle on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} upscale:{upscale} " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + r = upscale + assert shape[1] % (r * r) == 0 + + input = TestTensor(shape, input_stride, dtype, device) + + output_shape = ( + shape[0], + shape[1] // (r * r), + shape[2] * r, + shape[3] * r, + ) + + output = TestTensor(output_shape, output_stride, dtype, device) + + output.update_torch_tensor( + torch.nn.functional.pixel_shuffle(input.torch_tensor(), r) + ) + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreatePixelShuffleDescriptor( + handle, ctypes.byref(descriptor), output.descriptor, input.descriptor, r + ) + ) + + input.destroy_desc() + output.destroy_desc() + + ws = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetPixelShuffleWorkspaceSize(descriptor, ctypes.byref(ws)) + ) + workspace = TestWorkspace(ws.value, device) + + def lib_f(): + check_error( + LIBINFINIOP.infiniopPixelShuffle( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_f() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose( + output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.nn.functional.pixel_shuffle(input.torch_tensor(), r), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_f(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyPixelShuffleDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mTest passed!\033[0m")