diff --git a/include/infiniop.h b/include/infiniop.h index fe9537876..104476037 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -30,6 +30,9 @@ #include "infiniop/ops/dequantize_gptq.h" #include "infiniop/ops/embedding.h" #include "infiniop/ops/equal.h" +#include "infiniop/ops/erf.h" +#include "infiniop/ops/erfc.h" +#include "infiniop/ops/erfinv.h" #include "infiniop/ops/flash_attention.h" #include "infiniop/ops/flipud.h" #include "infiniop/ops/float_power.h" @@ -58,12 +61,14 @@ #include "infiniop/ops/logcumsumexp.h" #include "infiniop/ops/lp_norm.h" #include "infiniop/ops/masked_select.h" +#include "infiniop/ops/matrix_power.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/multi_margin_loss.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/paged_attention.h" #include "infiniop/ops/paged_attention_prefill.h" #include "infiniop/ops/paged_caching.h" +#include "infiniop/ops/pixel_shuffle.h" #include "infiniop/ops/quant/per_channel_quant_int8.h" #include "infiniop/ops/quant/per_tensor_quant_int8.h" #include "infiniop/ops/random_sample.h" diff --git a/include/infiniop/ops/erf.h b/include/infiniop/ops/erf.h new file mode 100644 index 000000000..8786b31a3 --- /dev/null +++ b/include/infiniop/ops/erf.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ERF_API_H__ +#define __INFINIOP_ERF_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopErfDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateErfDescriptor(infiniopHandle_t handle, + infiniopErfDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__INFINI_C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopErf(infiniopErfDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc); + 
+#endif diff --git a/include/infiniop/ops/erfc.h b/include/infiniop/ops/erfc.h new file mode 100644 index 000000000..6bceebe35 --- /dev/null +++ b/include/infiniop/ops/erfc.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ERFC_API_H__ +#define __INFINIOP_ERFC_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopErfcDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateErfcDescriptor(infiniopHandle_t handle, + infiniopErfcDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__INFINI_C __export infiniStatus_t infiniopGetErfcWorkspaceSize(infiniopErfcDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopErfc(infiniopErfcDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyErfcDescriptor(infiniopErfcDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/erfinv.h b/include/infiniop/ops/erfinv.h new file mode 100644 index 000000000..b14975253 --- /dev/null +++ b/include/infiniop/ops/erfinv.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ERFINV_API_H__ +#define __INFINIOP_ERFINV_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopErfinvDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateErfinvDescriptor(infiniopHandle_t handle, + infiniopErfinvDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__INFINI_C __export infiniStatus_t infiniopGetErfinvWorkspaceSize(infiniopErfinvDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopErfinv(infiniopErfinvDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyErfinvDescriptor(infiniopErfinvDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/matrix_power.h 
b/include/infiniop/ops/matrix_power.h new file mode 100644 index 000000000..639d3cf02 --- /dev/null +++ b/include/infiniop/ops/matrix_power.h @@ -0,0 +1,25 @@ +#ifndef __INFINIOP_MATRIX_POWER_API_H__ +#define __INFINIOP_MATRIX_POWER_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopMatrixPowerDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateMatrixPowerDescriptor(infiniopHandle_t handle, + infiniopMatrixPowerDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int n); + +__INFINI_C __export infiniStatus_t infiniopGetMatrixPowerWorkspaceSize(infiniopMatrixPowerDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopMatrixPower(infiniopMatrixPowerDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyMatrixPowerDescriptor(infiniopMatrixPowerDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/pixel_shuffle.h b/include/infiniop/ops/pixel_shuffle.h new file mode 100644 index 000000000..fac8eff2c --- /dev/null +++ b/include/infiniop/ops/pixel_shuffle.h @@ -0,0 +1,25 @@ +#ifndef __INFINIOP_PIXEL_SHUFFLE_API_H__ +#define __INFINIOP_PIXEL_SHUFFLE_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopPixelShuffleDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreatePixelShuffleDescriptor(infiniopHandle_t handle, + infiniopPixelShuffleDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int upscale_factor); + +__INFINI_C __export infiniStatus_t infiniopGetPixelShuffleWorkspaceSize(infiniopPixelShuffleDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopPixelShuffle(infiniopPixelShuffleDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__INFINI_C __export infiniStatus_t 
infiniopDestroyPixelShuffleDescriptor(infiniopPixelShuffleDescriptor_t desc); + +#endif diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.cc b/src/infiniop/ops/erf/cpu/erf_cpu.cc new file mode 100644 index 000000000..7d127bfae --- /dev/null +++ b/src/infiniop/ops/erf/cpu/erf_cpu.cc @@ -0,0 +1,52 @@ +#include "erf_cpu.h" + +namespace op::erf::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erf::cpu diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.h b/src/infiniop/ops/erf/cpu/erf_cpu.h new file mode 100644 index 000000000..74ad19e57 --- /dev/null +++ b/src/infiniop/ops/erf/cpu/erf_cpu.h @@ -0,0 +1,20 @@ +#ifndef __ERF_CPU_H__ +#define __ERF_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" 
+#include + +ELEMENTWISE_DESCRIPTOR(erf, cpu) + +namespace op::erf::cpu { +typedef struct ErfOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + return std::erf(x); + } +} ErfOp; +} // namespace op::erf::cpu + +#endif // __ERF_CPU_H__ diff --git a/src/infiniop/ops/erf/cuda/kernel.cuh b/src/infiniop/ops/erf/cuda/kernel.cuh new file mode 100644 index 000000000..8efe037e3 --- /dev/null +++ b/src/infiniop/ops/erf/cuda/kernel.cuh @@ -0,0 +1,33 @@ +#pragma once +#include +#include + +namespace op::cuda { + +struct ErfOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(T x) const { + if constexpr (std::is_same_v) { + return erff(x); + } else if constexpr (std::is_same_v) { + return ::erf(x); + } else { + // For F16/BF16: promote to float, compute, then cast back + float xf; + if constexpr (std::is_same_v) { + xf = __half2float(x); + return __float2half_rn(erff(xf)); + } else if constexpr (std::is_same_v) { + xf = __bfloat162float(x); + return __float2bfloat16_rn(erff(xf)); + } else { + xf = static_cast(x); + return static_cast(erff(xf)); + } + } + } +}; + +} // namespace op::cuda diff --git a/src/infiniop/ops/erf/erf.h b/src/infiniop/ops/erf/erf.h new file mode 100644 index 000000000..7c967dea2 --- /dev/null +++ b/src/infiniop/ops/erf/erf.h @@ -0,0 +1,8 @@ +#ifndef __ERF_H__ +#define __ERF_H__ + +#include "../../elementwise/elementwise.h" + +#define DESCRIPTOR(NAMESPACE) ELEMENTWISE_DESCRIPTOR(erf, NAMESPACE) + +#endif // __ERF_H__ diff --git a/src/infiniop/ops/erf/metax/erf_metax.h b/src/infiniop/ops/erf/metax/erf_metax.h new file mode 100644 index 000000000..5dfe23bbe --- /dev/null +++ b/src/infiniop/ops/erf/metax/erf_metax.h @@ -0,0 +1,8 @@ +#ifndef __ERF_METAX_API_H__ +#define __ERF_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(erf, metax) + +#endif // __ERF_METAX_API_H__ diff --git 
a/src/infiniop/ops/erf/metax/erf_metax.maca b/src/infiniop/ops/erf/metax/erf_metax.maca new file mode 100644 index 000000000..fcf956af2 --- /dev/null +++ b/src/infiniop/ops/erf/metax/erf_metax.maca @@ -0,0 +1,60 @@ +#include "erf_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::erf::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ErfOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erf::metax diff --git 
a/src/infiniop/ops/erf/moore/erf_moore.h b/src/infiniop/ops/erf/moore/erf_moore.h new file mode 100644 index 000000000..620055688 --- /dev/null +++ b/src/infiniop/ops/erf/moore/erf_moore.h @@ -0,0 +1,8 @@ +#ifndef __ERF_MOORE_API_H__ +#define __ERF_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(erf, moore) + +#endif // __ERF_MOORE_API_H__ diff --git a/src/infiniop/ops/erf/moore/erf_moore.mu b/src/infiniop/ops/erf/moore/erf_moore.mu new file mode 100644 index 000000000..856764c70 --- /dev/null +++ b/src/infiniop/ops/erf/moore/erf_moore.mu @@ -0,0 +1,60 @@ +#include "erf_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "../cuda/kernel.cuh" + +namespace op::erf::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfOp, half>(_info, workspace, output, inputs, stream); + case 
INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ErfOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erf::moore diff --git a/src/infiniop/ops/erf/nvidia/erf_nvidia.cu b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu new file mode 100644 index 000000000..03e14bb57 --- /dev/null +++ b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu @@ -0,0 +1,58 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "erf_nvidia.cuh" + +namespace op::erf::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + 
return _device_info->calculate<256, cuda::ErfOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::erf::nvidia diff --git a/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh b/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh new file mode 100644 index 000000000..d20658027 --- /dev/null +++ b/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ERF_NVIDIA_H__ +#define __ERF_NVIDIA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(erf, nvidia) + +#endif // __ERF_NVIDIA_H__ diff --git a/src/infiniop/ops/erf/operator.cc b/src/infiniop/ops/erf/operator.cc new file mode 100644 index 000000000..7ed264e33 --- /dev/null +++ b/src/infiniop/ops/erf/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/erf.h" + +#ifdef ENABLE_CPU_API +#include "cpu/erf_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/erf_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/erf_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/erf_moore.h" +#endif + +__INFINI_C __export infiniStatus_t infiniopCreateErfDescriptor( + infiniopHandle_t handle, + infiniopErfDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::erf::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + 
CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__INFINI_C __export infiniStatus_t infiniopErf( + infiniopErfDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C __export infiniStatus_t +infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + 
+#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/erfc/cpu/erfc_cpu.cc b/src/infiniop/ops/erfc/cpu/erfc_cpu.cc new file mode 100644 index 000000000..35b82c678 --- /dev/null +++ b/src/infiniop/ops/erfc/cpu/erfc_cpu.cc @@ -0,0 +1,52 @@ +#include "erfc_cpu.h" + +namespace op::erfc::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return 
INFINI_STATUS_SUCCESS; +} + +} // namespace op::erfc::cpu diff --git a/src/infiniop/ops/erfc/cpu/erfc_cpu.h b/src/infiniop/ops/erfc/cpu/erfc_cpu.h new file mode 100644 index 000000000..dd6d69496 --- /dev/null +++ b/src/infiniop/ops/erfc/cpu/erfc_cpu.h @@ -0,0 +1,20 @@ +#ifndef __ERFC_CPU_H__ +#define __ERFC_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(erfc, cpu) + +namespace op::erfc::cpu { +typedef struct ErfcOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + return std::erfc(x); + } +} ErfcOp; +} // namespace op::erfc::cpu + +#endif // __ERFC_CPU_H__ diff --git a/src/infiniop/ops/erfc/cuda/kernel.cuh b/src/infiniop/ops/erfc/cuda/kernel.cuh new file mode 100644 index 000000000..5a448919d --- /dev/null +++ b/src/infiniop/ops/erfc/cuda/kernel.cuh @@ -0,0 +1,33 @@ +#pragma once +#include +#include + +namespace op::cuda { + +struct ErfcOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(T x) const { + if constexpr (std::is_same_v) { + return erfcf(x); + } else if constexpr (std::is_same_v) { + return ::erfc(x); + } else { + // For F16/BF16: promote to float, compute, then cast back + float xf; + if constexpr (std::is_same_v) { + xf = __half2float(x); + return __float2half_rn(erfcf(xf)); + } else if constexpr (std::is_same_v) { + xf = __bfloat162float(x); + return __float2bfloat16_rn(erfcf(xf)); + } else { + xf = static_cast(x); + return static_cast(erfcf(xf)); + } + } + } +}; + +} // namespace op::cuda diff --git a/src/infiniop/ops/erfc/erfc.h b/src/infiniop/ops/erfc/erfc.h new file mode 100644 index 000000000..9ee12fd43 --- /dev/null +++ b/src/infiniop/ops/erfc/erfc.h @@ -0,0 +1,8 @@ +#ifndef __ERFC_H__ +#define __ERFC_H__ + +#include "../../elementwise/elementwise.h" + +#define DESCRIPTOR(NAMESPACE) ELEMENTWISE_DESCRIPTOR(erfc, NAMESPACE) + +#endif // __ERFC_H__ diff --git 
a/src/infiniop/ops/erfc/metax/erfc_metax.h b/src/infiniop/ops/erfc/metax/erfc_metax.h new file mode 100644 index 000000000..438f00095 --- /dev/null +++ b/src/infiniop/ops/erfc/metax/erfc_metax.h @@ -0,0 +1,8 @@ +#ifndef __ERFC_METAX_API_H__ +#define __ERFC_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(erfc, metax) + +#endif // __ERFC_METAX_API_H__ diff --git a/src/infiniop/ops/erfc/metax/erfc_metax.maca b/src/infiniop/ops/erfc/metax/erfc_metax.maca new file mode 100644 index 000000000..925e52ccf --- /dev/null +++ b/src/infiniop/ops/erfc/metax/erfc_metax.maca @@ -0,0 +1,60 @@ +#include "erfc_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::erfc::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfcOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfcOp, cuda_bfloat16>(_info, workspace, output, 
inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ErfcOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfcOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erfc::metax diff --git a/src/infiniop/ops/erfc/moore/erfc_moore.h b/src/infiniop/ops/erfc/moore/erfc_moore.h new file mode 100644 index 000000000..d032e4305 --- /dev/null +++ b/src/infiniop/ops/erfc/moore/erfc_moore.h @@ -0,0 +1,8 @@ +#ifndef __ERFC_MOORE_API_H__ +#define __ERFC_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(erfc, moore) + +#endif // __ERFC_MOORE_API_H__ diff --git a/src/infiniop/ops/erfc/moore/erfc_moore.mu b/src/infiniop/ops/erfc/moore/erfc_moore.mu new file mode 100644 index 000000000..d58243e50 --- /dev/null +++ b/src/infiniop/ops/erfc/moore/erfc_moore.mu @@ -0,0 +1,60 @@ +#include "erfc_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "../cuda/kernel.cuh" + +namespace op::erfc::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + 
std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfcOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfcOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ErfcOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfcOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erfc::moore diff --git a/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cu b/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cu new file mode 100644 index 000000000..483f11a18 --- /dev/null +++ b/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cu @@ -0,0 +1,58 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "erfc_nvidia.cuh" + +namespace op::erfc::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector 
inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfcOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfcOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ErfcOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfcOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::erfc::nvidia diff --git a/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cuh b/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cuh new file mode 100644 index 000000000..4d5321c9d --- /dev/null +++ b/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ERFC_NVIDIA_H__ +#define __ERFC_NVIDIA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(erfc, nvidia) + +#endif // __ERFC_NVIDIA_H__ diff --git a/src/infiniop/ops/erfc/operator.cc b/src/infiniop/ops/erfc/operator.cc new file mode 100644 index 000000000..ebd692349 --- /dev/null +++ b/src/infiniop/ops/erfc/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/erfc.h" + +#ifdef ENABLE_CPU_API +#include "cpu/erfc_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/erfc_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/erfc_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/erfc_moore.h" +#endif + +__INFINI_C __export infiniStatus_t infiniopCreateErfcDescriptor( + infiniopHandle_t handle, + infiniopErfcDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t 
x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::erfc::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C __export infiniStatus_t infiniopGetErfcWorkspaceSize(infiniopErfcDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__INFINI_C __export infiniStatus_t infiniopErfc( + infiniopErfcDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + 
CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C __export infiniStatus_t +infiniopDestroyErfcDescriptor(infiniopErfcDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/erfinv/cpu/erfinv_cpu.cc b/src/infiniop/ops/erfinv/cpu/erfinv_cpu.cc new file mode 100644 index 000000000..16c5c8cba --- /dev/null +++ b/src/infiniop/ops/erfinv/cpu/erfinv_cpu.cc @@ -0,0 +1,52 @@ +#include "erfinv_cpu.h" + +namespace op::erfinv::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + 
void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erfinv::cpu diff --git a/src/infiniop/ops/erfinv/cpu/erfinv_cpu.h b/src/infiniop/ops/erfinv/cpu/erfinv_cpu.h new file mode 100644 index 000000000..e7b1b723d --- /dev/null +++ b/src/infiniop/ops/erfinv/cpu/erfinv_cpu.h @@ -0,0 +1,56 @@ +#ifndef __ERFINV_CPU_H__ +#define __ERFINV_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include +#include + +ELEMENTWISE_DESCRIPTOR(erfinv, cpu) + +namespace op::erfinv::cpu { + +// Inverse error function implementation using Newton's method +template +T erfinv_impl(T x) { + // Domain: x in (-1, 1) + if (x == 1.0) { + return std::numeric_limits::infinity(); + } + if (x == -1.0) { + return -std::numeric_limits::infinity(); + } + if (x > 1.0 || x < -1.0) { + return std::numeric_limits::quiet_NaN(); + } + if (x == 0.0) { + return 0.0; + } + + // Use Newton's method to solve erf(y) = x + T y = x; // Initial guess + const int max_iter = 10; + const T tol = static_cast(1e-10); + + for (int i = 0; i < max_iter; ++i) { + T erf_y = std::erf(y); + T derf_dy = T(2.0) / T(std::sqrt(3.14159265358979323846) * std::exp(-y * y)); + T error = erf_y - x; + if (std::abs(error) < tol) { + break; + } + y = y - error / derf_dy; + } + return y; +} + +typedef struct ErfinvOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + return erfinv_impl(x); + } +} ErfinvOp; +} // namespace op::erfinv::cpu + +#endif // __ERFINV_CPU_H__ diff --git a/src/infiniop/ops/erfinv/cuda/kernel.cuh 
b/src/infiniop/ops/erfinv/cuda/kernel.cuh new file mode 100644 index 000000000..18025331a --- /dev/null +++ b/src/infiniop/ops/erfinv/cuda/kernel.cuh @@ -0,0 +1,128 @@ +#pragma once +#include +#include +#include + +namespace op::cuda { + +constexpr float PI_F = 3.14159265358979323846f; +constexpr double PI = 3.14159265358979323846; + +// Inverse error function. +__device__ __forceinline__ float erfinv_impl(float x) { + if (x == 1.0f) { + return std::numeric_limits::infinity(); + } + if (x == -1.0f) { + return -std::numeric_limits::infinity(); + } + if (x > 1.0f || x < -1.0f) { + return std::numeric_limits::quiet_NaN(); + } + if (x == 0.0f) { + return 0.0f; + } + + // Winitzki approximation + const float a = 0.147f; + const float ln = log1pf(-x * x); + const float t = 2.0f / (PI_F * a) + ln * 0.5f; + + float inside = t * t - ln / a; + inside = inside > 0.0f ? inside : 0.0f; + + float y0 = copysignf(sqrtf(sqrtf(inside) - t), x); + + float y = y0; + const float sqrt_pi_f = sqrtf(PI_F); + +#pragma unroll + for (int i = 0; i < 4; ++i) { + const float erf_y = erff(y); + const float derf_dy = 2.0f / sqrt_pi_f * expf(-y * y); + y = y - (erf_y - x) / derf_dy; + } + + // Slow path near |x| ~ 1 + const float ax = fabsf(x); + if (1.0f - ax < 1e-4f) { + const double xd = static_cast(x); + double yd = static_cast(y); + const double sqrt_pi = sqrt(PI); + +#pragma unroll + for (int i = 0; i < 4; ++i) { + const double erf_y = erf(yd); + const double derf_dy = 2.0 / sqrt_pi * exp(-yd * yd); + yd = yd - (erf_y - xd) / derf_dy; + } + y = static_cast(yd); + } + + return y; +} + +struct ErfinvOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(T x) const { + if constexpr (std::is_same_v) { + return erfinv_impl(x); + + } else if constexpr (std::is_same_v) { + if (x == 1.0) { + return std::numeric_limits::infinity(); + } + if (x == -1.0) { + return -std::numeric_limits::infinity(); + } + if (x > 1.0 || x < -1.0) { + return 
std::numeric_limits::quiet_NaN(); + } + if (x == 0.0) { + return 0.0; + } + + const double a = 0.147; + const double ln = log1p(-x * x); + const double t = 2.0 / (PI * a) + ln * 0.5; + + double inside = t * t - ln / a; + inside = inside > 0.0 ? inside : 0.0; + + double y = copysign(sqrt(sqrt(inside) - t), x); + + const int max_iter = 30; + const double tol = 1e-14; + const double sqrt_pi = sqrt(PI); + + for (int i = 0; i < max_iter; ++i) { + const double erf_y = erf(y); + const double error = erf_y - x; + if (fabs(error) < tol) { + break; + } + const double derf_dy = 2.0 / sqrt_pi * exp(-y * y); + y = y - error / derf_dy; + } + return y; + + } else { + // F16 / BF16 / other types + float xf; + if constexpr (std::is_same_v) { + xf = __half2float(x); + return __float2half_rn(erfinv_impl(xf)); + } else if constexpr (std::is_same_v) { + xf = __bfloat162float(x); + return __float2bfloat16_rn(erfinv_impl(xf)); + } else { + xf = static_cast(x); + return static_cast(erfinv_impl(xf)); + } + } + } +}; + +} // namespace op::cuda diff --git a/src/infiniop/ops/erfinv/erfinv.h b/src/infiniop/ops/erfinv/erfinv.h new file mode 100644 index 000000000..f3ed9350f --- /dev/null +++ b/src/infiniop/ops/erfinv/erfinv.h @@ -0,0 +1,8 @@ +#ifndef __ERFINV_H__ +#define __ERFINV_H__ + +#include "../../elementwise/elementwise.h" + +#define DESCRIPTOR(NAMESPACE) ELEMENTWISE_DESCRIPTOR(erfinv, NAMESPACE) + +#endif // __ERFINV_H__ diff --git a/src/infiniop/ops/erfinv/metax/erfinv_metax.h b/src/infiniop/ops/erfinv/metax/erfinv_metax.h new file mode 100644 index 000000000..05058bfc6 --- /dev/null +++ b/src/infiniop/ops/erfinv/metax/erfinv_metax.h @@ -0,0 +1,8 @@ +#ifndef __ERFINV_METAX_API_H__ +#define __ERFINV_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(erfinv, metax) + +#endif // __ERFINV_METAX_API_H__ diff --git a/src/infiniop/ops/erfinv/metax/erfinv_metax.maca b/src/infiniop/ops/erfinv/metax/erfinv_metax.maca new file mode 100644 
index 000000000..1e9144074 --- /dev/null +++ b/src/infiniop/ops/erfinv/metax/erfinv_metax.maca @@ -0,0 +1,60 @@ +#include "erfinv_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::erfinv::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfinvOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfinvOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ErfinvOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfinvOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erfinv::metax diff --git a/src/infiniop/ops/erfinv/moore/erfinv_moore.h b/src/infiniop/ops/erfinv/moore/erfinv_moore.h new file mode 
100644 index 000000000..9eed18024 --- /dev/null +++ b/src/infiniop/ops/erfinv/moore/erfinv_moore.h @@ -0,0 +1,8 @@ +#ifndef __ERFINV_MOORE_API_H__ +#define __ERFINV_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(erfinv, moore) + +#endif // __ERFINV_MOORE_API_H__ diff --git a/src/infiniop/ops/erfinv/moore/erfinv_moore.mu b/src/infiniop/ops/erfinv/moore/erfinv_moore.mu new file mode 100644 index 000000000..26febe69d --- /dev/null +++ b/src/infiniop/ops/erfinv/moore/erfinv_moore.mu @@ -0,0 +1,60 @@ +#include "erfinv_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "../cuda/kernel.cuh" + +namespace op::erfinv::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfinvOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfinvOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, 
cuda::ErfinvOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfinvOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erfinv::moore diff --git a/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cu b/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cu new file mode 100644 index 000000000..35f5d3fe2 --- /dev/null +++ b/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cu @@ -0,0 +1,58 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "erfinv_nvidia.cuh" + +namespace op::erfinv::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfinvOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfinvOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return 
_device_info->calculate<256, cuda::ErfinvOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfinvOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::erfinv::nvidia diff --git a/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cuh b/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cuh new file mode 100644 index 000000000..af80be12f --- /dev/null +++ b/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ERFINV_NVIDIA_H__ +#define __ERFINV_NVIDIA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(erfinv, nvidia) + +#endif // __ERFINV_NVIDIA_H__ diff --git a/src/infiniop/ops/erfinv/operator.cc b/src/infiniop/ops/erfinv/operator.cc new file mode 100644 index 000000000..c0453562b --- /dev/null +++ b/src/infiniop/ops/erfinv/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/erfinv.h" + +#ifdef ENABLE_CPU_API +#include "cpu/erfinv_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/erfinv_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/erfinv_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/erfinv_moore.h" +#endif + +__INFINI_C __export infiniStatus_t infiniopCreateErfinvDescriptor( + infiniopHandle_t handle, + infiniopErfinvDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::erfinv::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + 
CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C __export infiniStatus_t infiniopGetErfinvWorkspaceSize(infiniopErfinvDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__INFINI_C __export infiniStatus_t infiniopErfinv( + infiniopErfinvDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C __export infiniStatus_t +infiniopDestroyErfinvDescriptor(infiniopErfinvDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete 
reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.cc b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.cc new file mode 100644 index 000000000..6c2fe60cd --- /dev/null +++ b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.cc @@ -0,0 +1,201 @@ +#include "matrix_power_cpu.h" +#include "../../../tensor.h" +#include +#include + +namespace op::matrix_power::cpu { + +utils::Result MatrixPowerInfo::create( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + int n) { + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 2 || x_shape[0] != x_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (y_shape != x_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (n < 0) { + return INFINI_STATUS_BAD_PARAM; + } + + MatrixPowerInfo info; + info.matrix_size = x_shape[0]; + info.n = static_cast(n); + info.input_size = x_desc->numel(); + info.output_size = y_desc->numel(); + + return utils::Result(std::move(info)); +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + 
CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES); + CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + + auto info_result = MatrixPowerInfo::create(x_desc, y_desc, n); + CHECK_RESULT(info_result); + + *desc_ptr = new Descriptor(dtype, info_result.take(), handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +void matrix_power_impl( + const MatrixPowerInfo &info, + T *y, + const T *x, + void *workspace) { + + size_t n = info.matrix_size; + int power = static_cast(info.n); + + // Use workspace for temporary matrices + T *temp1 = reinterpret_cast(workspace); + T *temp2 = temp1 + n * n; + + // Initialize result as identity matrix + std::memset(y, 0, n * n * sizeof(T)); + for (size_t i = 0; i < n; ++i) { + y[i * n + i] = utils::cast(1.0); + } + + // Copy input to temp1 + std::memcpy(temp1, x, n * n * sizeof(T)); + + // Binary exponentiation + while (power > 0) { + if (power & 1) { + // Multiply result by temp1 + std::memset(temp2, 0, n * n * sizeof(T)); + for (size_t i = 0; i < n; ++i) { + for (size_t k = 0; k < n; ++k) { + T val = y[i * n + k]; + for (size_t j = 0; j < n; ++j) { + temp2[i * n + j] += val * temp1[k * n + j]; + } + } + } + std::memcpy(y, temp2, n * n * sizeof(T)); + } + // Square temp1 + std::memset(temp2, 0, n * n * sizeof(T)); + for (size_t i = 0; i < n; ++i) { + for (size_t k = 0; k < n; ++k) { + T val = temp1[i * n + k]; + for (size_t j = 0; j < n; ++j) { + temp2[i * n + j] += val * temp1[k * n + j]; + } + } + } + std::memcpy(temp1, temp2, n * n * sizeof(T)); + power >>= 1; + } +} + +template +void write_identity_impl(size_t n, T *y) { + std::fill(y, y + n * n, utils::cast(0.0)); + for (size_t i = 0; i < n; ++i) { + y[i * n + i] = utils::cast(1.0); + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + if 
(_info.matrix_size == 0) { + return INFINI_STATUS_SUCCESS; + } + if (_info.n == 0) { + const size_t n = _info.matrix_size; + switch (_dtype) { + case INFINI_DTYPE_F16: + write_identity_impl(n, reinterpret_cast(y)); + return INFINI_STATUS_SUCCESS; + case INFINI_DTYPE_BF16: + write_identity_impl(n, reinterpret_cast(y)); + return INFINI_STATUS_SUCCESS; + case INFINI_DTYPE_F32: + write_identity_impl(n, reinterpret_cast(y)); + return INFINI_STATUS_SUCCESS; + case INFINI_DTYPE_F64: + write_identity_impl(n, reinterpret_cast(y)); + return INFINI_STATUS_SUCCESS; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + } + + if (workspace_size < this->workspaceSize()) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + if (this->workspaceSize() > 0 && workspace == nullptr) { + return INFINI_STATUS_BAD_PARAM; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: { + // Convert to float for computation + std::vector x_f(_info.input_size); + std::vector y_f(_info.output_size); + float *workspace_f = reinterpret_cast(workspace); + for (size_t i = 0; i < _info.input_size; ++i) { + x_f[i] = utils::cast(reinterpret_cast(x)[i]); + } + MatrixPowerInfo info_f = _info; + matrix_power_impl(info_f, y_f.data(), x_f.data(), workspace_f); + for (size_t i = 0; i < _info.output_size; ++i) { + reinterpret_cast(y)[i] = utils::cast(y_f[i]); + } + break; + } + case INFINI_DTYPE_BF16: { + std::vector x_f(_info.input_size); + std::vector y_f(_info.output_size); + float *workspace_f = reinterpret_cast(workspace); + for (size_t i = 0; i < _info.input_size; ++i) { + x_f[i] = utils::cast(reinterpret_cast(x)[i]); + } + MatrixPowerInfo info_f = _info; + matrix_power_impl(info_f, y_f.data(), x_f.data(), workspace_f); + for (size_t i = 0; i < _info.output_size; ++i) { + reinterpret_cast(y)[i] = utils::cast(y_f[i]); + } + break; + } + case INFINI_DTYPE_F32: + matrix_power_impl(_info, reinterpret_cast(y), reinterpret_cast(x), workspace); + break; + case INFINI_DTYPE_F64: + matrix_power_impl(_info, 
reinterpret_cast(y), reinterpret_cast(x), workspace); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::matrix_power::cpu diff --git a/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.h b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.h new file mode 100644 index 000000000..06a14c341 --- /dev/null +++ b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.h @@ -0,0 +1,69 @@ +#ifndef __MATRIX_POWER_CPU_H__ +#define __MATRIX_POWER_CPU_H__ + +#include "../../../devices/cpu/common_cpu.h" +#include "../../../operator.h" +#include + +namespace op::matrix_power::cpu { + +struct MatrixPowerInfo { + size_t matrix_size; // N x N matrix + size_t n; // Power + size_t input_size; + size_t output_size; + + static utils::Result create( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + int n); +}; + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + MatrixPowerInfo _info; + + Descriptor(infiniDtype_t dtype, MatrixPowerInfo info, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n); + + size_t workspaceSize() const { + if (_info.n == 0 || _info.matrix_size == 0) { + return 0; + } + const size_t elems = 2 * _info.matrix_size * _info.matrix_size; + switch (_dtype) { + case INFINI_DTYPE_F16: + case INFINI_DTYPE_BF16: + case INFINI_DTYPE_F32: + return elems * sizeof(float); + case INFINI_DTYPE_F64: + return elems * sizeof(double); + default: + return 0; + } + } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::matrix_power::cpu + +#endif // __MATRIX_POWER_CPU_H__ diff --git 
a/src/infiniop/ops/matrix_power/metax/matrix_power_metax.h b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.h new file mode 100644 index 000000000..43e561f66 --- /dev/null +++ b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.h @@ -0,0 +1,47 @@ +#ifndef __MATRIX_POWER_METAX_H__ +#define __MATRIX_POWER_METAX_H__ + +#include "../../../operator.h" + +namespace op::matrix_power::metax { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t matrix_size; + size_t n; + size_t input_size; + size_t output_size; + + Descriptor(infiniDtype_t dtype, size_t matrix_size, size_t n, + size_t input_size, size_t output_size, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + matrix_size(matrix_size), + n(n), + input_size(input_size), + output_size(output_size) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::matrix_power::metax + +#endif // __MATRIX_POWER_METAX_H__ diff --git a/src/infiniop/ops/matrix_power/metax/matrix_power_metax.maca b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.maca new file mode 100644 index 000000000..221757141 --- /dev/null +++ b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.maca @@ -0,0 +1,107 @@ +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../tensor.h" +#include "matrix_power_metax.h" +#include +#include + +namespace op::matrix_power::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + 
infiniopTensorDescriptor_t x_desc, + int n) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F32); + CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 2 || x_shape[0] != x_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (y_shape != x_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + CHECK_OR_RETURN(n >= 0, INFINI_STATUS_BAD_PARAM); + + CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES); + CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + + *desc_ptr = new Descriptor(dtype, x_shape[0], static_cast(n), + x_desc->numel(), y_desc->numel(), + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + if (workspace_size < this->workspaceSize()) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + auto hc_stream = reinterpret_cast(stream); + CHECK_OR_RETURN(_dtype == INFINI_DTYPE_F32, INFINI_STATUS_BAD_TENSOR_DTYPE); + size_t input_bytes = input_size * sizeof(float); + + // Use CPU fallback for now + std::vector h_matrix(input_size); + CHECK_METAX(hcMemcpyAsync(h_matrix.data(), x, input_bytes, hcMemcpyDeviceToHost, hc_stream)); + CHECK_METAX(hcStreamSynchronize(hc_stream)); + + std::vector result(output_size, 0.0f); + std::vector temp1(input_size); + std::vector temp2(input_size); + std::memcpy(temp1.data(), h_matrix.data(), input_bytes); + + for (size_t i = 0; i < matrix_size; ++i) { + result[i * matrix_size + i] = 1.0f; + } + + int power = static_cast(n); + while (power > 0) { + if (power & 1) { + std::fill(temp2.begin(), temp2.end(), 0.0f); + for (size_t i = 0; i < matrix_size; ++i) { + for (size_t k = 0; k < matrix_size; ++k) { + float val = 
result[i * matrix_size + k]; + for (size_t j = 0; j < matrix_size; ++j) { + temp2[i * matrix_size + j] += val * temp1[k * matrix_size + j]; + } + } + } + std::memcpy(result.data(), temp2.data(), output_size * sizeof(float)); + } + std::fill(temp2.begin(), temp2.end(), 0.0f); + for (size_t i = 0; i < matrix_size; ++i) { + for (size_t k = 0; k < matrix_size; ++k) { + float val = temp1[i * matrix_size + k]; + for (size_t j = 0; j < matrix_size; ++j) { + temp2[i * matrix_size + j] += val * temp1[k * matrix_size + j]; + } + } + } + std::memcpy(temp1.data(), temp2.data(), input_bytes); + power >>= 1; + } + + CHECK_METAX(hcMemcpyAsync(y, result.data(), input_bytes, hcMemcpyHostToDevice, hc_stream)); + CHECK_METAX(hcStreamSynchronize(hc_stream)); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::matrix_power::metax diff --git a/src/infiniop/ops/matrix_power/moore/matrix_power_moore.h b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.h new file mode 100644 index 000000000..42dd5c180 --- /dev/null +++ b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.h @@ -0,0 +1,47 @@ +#ifndef __MATRIX_POWER_MOORE_H__ +#define __MATRIX_POWER_MOORE_H__ + +#include "../../../operator.h" + +namespace op::matrix_power::moore { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t matrix_size; + size_t n; + size_t input_size; + size_t output_size; + + Descriptor(infiniDtype_t dtype, size_t matrix_size, size_t n, + size_t input_size, size_t output_size, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + matrix_size(matrix_size), + n(n), + input_size(input_size), + output_size(output_size) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, 
+ size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::matrix_power::moore + +#endif // __MATRIX_POWER_MOORE_H__ diff --git a/src/infiniop/ops/matrix_power/moore/matrix_power_moore.mu b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.mu new file mode 100644 index 000000000..938ad5c0a --- /dev/null +++ b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.mu @@ -0,0 +1,106 @@ +#include "../../../devices/moore/moore_common.h" +#include "../../../devices/moore/moore_kernel_common.h" +#include "../../../tensor.h" +#include "matrix_power_moore.h" +#include +#include + +namespace op::matrix_power::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F32); + CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 2 || x_shape[0] != x_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (y_shape != x_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + CHECK_OR_RETURN(n >= 0, INFINI_STATUS_BAD_PARAM); + + CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES); + CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + + *desc_ptr = new Descriptor(dtype, x_shape[0], static_cast(n), + x_desc->numel(), y_desc->numel(), + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + if (workspace_size < this->workspaceSize()) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + auto musa_stream = 
reinterpret_cast(stream); + CHECK_OR_RETURN(_dtype == INFINI_DTYPE_F32, INFINI_STATUS_BAD_TENSOR_DTYPE); + size_t input_bytes = input_size * sizeof(float); + + std::vector h_matrix(input_size); + CHECK_MOORE(musaMemcpyAsync(h_matrix.data(), x, input_bytes, musaMemcpyDeviceToHost, musa_stream)); + CHECK_MOORE(musaStreamSynchronize(musa_stream)); + + std::vector result(output_size, 0.0f); + std::vector temp1(input_size); + std::vector temp2(input_size); + std::memcpy(temp1.data(), h_matrix.data(), input_bytes); + + for (size_t i = 0; i < matrix_size; ++i) { + result[i * matrix_size + i] = 1.0f; + } + + int power = static_cast(n); + while (power > 0) { + if (power & 1) { + std::fill(temp2.begin(), temp2.end(), 0.0f); + for (size_t i = 0; i < matrix_size; ++i) { + for (size_t k = 0; k < matrix_size; ++k) { + float val = result[i * matrix_size + k]; + for (size_t j = 0; j < matrix_size; ++j) { + temp2[i * matrix_size + j] += val * temp1[k * matrix_size + j]; + } + } + } + std::memcpy(result.data(), temp2.data(), output_size * sizeof(float)); + } + std::fill(temp2.begin(), temp2.end(), 0.0f); + for (size_t i = 0; i < matrix_size; ++i) { + for (size_t k = 0; k < matrix_size; ++k) { + float val = temp1[i * matrix_size + k]; + for (size_t j = 0; j < matrix_size; ++j) { + temp2[i * matrix_size + j] += val * temp1[k * matrix_size + j]; + } + } + } + std::memcpy(temp1.data(), temp2.data(), input_bytes); + power >>= 1; + } + + CHECK_MOORE(musaMemcpyAsync(y, result.data(), input_bytes, musaMemcpyHostToDevice, musa_stream)); + CHECK_MOORE(musaStreamSynchronize(musa_stream)); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::matrix_power::moore diff --git a/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cu b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cu new file mode 100644 index 000000000..d728238c1 --- /dev/null +++ b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cu @@ -0,0 +1,546 @@ +#include 
"../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../../../tensor.h" +#include "matrix_power_nvidia.cuh" +#include +#include + +namespace op::matrix_power::nvidia { + +namespace { + +template +__forceinline__ __device__ T identityZero(); + +template +__forceinline__ __device__ T identityOne(); + +template <> +__forceinline__ __device__ __half identityZero<__half>() { + return __float2half(0.0f); +} + +template <> +__forceinline__ __device__ __half identityOne<__half>() { + return __float2half(1.0f); +} + +template <> +__forceinline__ __device__ cuda_bfloat16 identityZero() { + return __float2bfloat16(0.0f); +} + +template <> +__forceinline__ __device__ cuda_bfloat16 identityOne() { + return __float2bfloat16(1.0f); +} + +template <> +__forceinline__ __device__ float identityZero() { + return 0.0f; +} + +template <> +__forceinline__ __device__ float identityOne() { + return 1.0f; +} + +template <> +__forceinline__ __device__ double identityZero() { + return 0.0; +} + +template <> +__forceinline__ __device__ double identityOne() { + return 1.0; +} + +template +INFINIOP_CUDA_KERNEL packMatrix2dStridedToContiguous( + const T *src, + T *dst, + size_t matrix_size, + ptrdiff_t src_stride_0, + ptrdiff_t src_stride_1) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t matrix_numel = matrix_size * matrix_size; + if (idx < matrix_numel) { + size_t row = idx / matrix_size; + size_t col = idx - row * matrix_size; + dst[idx] = src[row * src_stride_0 + col * src_stride_1]; + } +} + +template +INFINIOP_CUDA_KERNEL scatterMatrix2dContiguousToStrided( + const T *src, + T *dst, + size_t matrix_size, + ptrdiff_t dst_stride_0, + ptrdiff_t dst_stride_1) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t matrix_numel = matrix_size * matrix_size; + if (idx < matrix_numel) { + size_t row = idx / matrix_size; + size_t col = idx - row * matrix_size; + dst[row * dst_stride_0 + col * dst_stride_1] = 
src[idx]; + } +} + +template +INFINIOP_CUDA_KERNEL setIdentity2dStrided( + T *out, + size_t matrix_size, + ptrdiff_t out_stride_0, + ptrdiff_t out_stride_1) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t matrix_numel = matrix_size * matrix_size; + if (idx < matrix_numel) { + size_t row = idx / matrix_size; + size_t col = idx - row * matrix_size; + out[row * out_stride_0 + col * out_stride_1] = (row == col) ? identityOne() : identityZero(); + } +} + +INFINIOP_CUDA_KERNEL setDiagonalFp16(__half *out, size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + out[idx * n + idx] = __float2half(1.0f); + } +} + +INFINIOP_CUDA_KERNEL setDiagonalBf16(cuda_bfloat16 *out, size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + out[idx * n + idx] = __float2bfloat16(1.0f); + } +} + +INFINIOP_CUDA_KERNEL setDiagonalFp32(float *out, size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + out[idx * n + idx] = 1.0f; + } +} + +INFINIOP_CUDA_KERNEL setDiagonalFp64(double *out, size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + out[idx * n + idx] = 1.0; + } +} + +infiniStatus_t initializeIdentity( + void *y, + infiniDtype_t dtype, + size_t matrix_size, + bool y_contiguous, + ptrdiff_t y_stride_0, + ptrdiff_t y_stride_1, + cudaStream_t stream) { + + if (matrix_size == 0) { + return INFINI_STATUS_SUCCESS; + } + + constexpr int threads = 256; + size_t diag_blocks = CEIL_DIV(matrix_size, static_cast(threads)); + size_t matrix_numel = matrix_size * matrix_size; + size_t matrix_blocks = CEIL_DIV(matrix_numel, static_cast(threads)); + + if (y_contiguous) { + CHECK_CUDA(cudaMemsetAsync(y, 0, matrix_numel * infiniSizeOf(dtype), stream)); + } + + switch (dtype) { + case INFINI_DTYPE_F16: + if (y_contiguous) { + setDiagonalFp16<<(diag_blocks), threads, 0, stream>>>( + reinterpret_cast<__half *>(y), matrix_size); + } else { + 
setIdentity2dStrided<<(matrix_blocks), threads, 0, stream>>>( + reinterpret_cast<__half *>(y), matrix_size, y_stride_0, y_stride_1); + } + break; + case INFINI_DTYPE_BF16: + if (y_contiguous) { + setDiagonalBf16<<(diag_blocks), threads, 0, stream>>>( + reinterpret_cast(y), matrix_size); + } else { + setIdentity2dStrided<<(matrix_blocks), threads, 0, stream>>>( + reinterpret_cast(y), matrix_size, y_stride_0, y_stride_1); + } + break; + case INFINI_DTYPE_F32: + if (y_contiguous) { + setDiagonalFp32<<(diag_blocks), threads, 0, stream>>>( + reinterpret_cast(y), matrix_size); + } else { + setIdentity2dStrided<<(matrix_blocks), threads, 0, stream>>>( + reinterpret_cast(y), matrix_size, y_stride_0, y_stride_1); + } + break; + case INFINI_DTYPE_F64: + if (y_contiguous) { + setDiagonalFp64<<(diag_blocks), threads, 0, stream>>>( + reinterpret_cast(y), matrix_size); + } else { + setIdentity2dStrided<<(matrix_blocks), threads, 0, stream>>>( + reinterpret_cast(y), matrix_size, y_stride_0, y_stride_1); + } + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + CHECK_CUDA(cudaGetLastError()); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t packInputToContiguous( + void *dst, + const void *src, + infiniDtype_t dtype, + size_t matrix_size, + ptrdiff_t src_stride_0, + ptrdiff_t src_stride_1, + cudaStream_t stream) { + constexpr int threads = 256; + size_t matrix_numel = matrix_size * matrix_size; + size_t blocks = CEIL_DIV(matrix_numel, static_cast(threads)); + switch (dtype) { + case INFINI_DTYPE_F16: + packMatrix2dStridedToContiguous<<(blocks), threads, 0, stream>>>( + reinterpret_cast(src), reinterpret_cast<__half *>(dst), + matrix_size, src_stride_0, src_stride_1); + break; + case INFINI_DTYPE_BF16: + packMatrix2dStridedToContiguous<<(blocks), threads, 0, stream>>>( + reinterpret_cast(src), reinterpret_cast(dst), + matrix_size, src_stride_0, src_stride_1); + break; + case INFINI_DTYPE_F32: + packMatrix2dStridedToContiguous<<(blocks), threads, 0, stream>>>( + 
reinterpret_cast(src), reinterpret_cast(dst), + matrix_size, src_stride_0, src_stride_1); + break; + case INFINI_DTYPE_F64: + packMatrix2dStridedToContiguous<<(blocks), threads, 0, stream>>>( + reinterpret_cast(src), reinterpret_cast(dst), + matrix_size, src_stride_0, src_stride_1); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + CHECK_CUDA(cudaGetLastError()); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t scatterContiguousToOutput( + void *dst, + const void *src, + infiniDtype_t dtype, + size_t matrix_size, + ptrdiff_t dst_stride_0, + ptrdiff_t dst_stride_1, + cudaStream_t stream) { + constexpr int threads = 256; + size_t matrix_numel = matrix_size * matrix_size; + size_t blocks = CEIL_DIV(matrix_numel, static_cast(threads)); + switch (dtype) { + case INFINI_DTYPE_F16: + scatterMatrix2dContiguousToStrided<<(blocks), threads, 0, stream>>>( + reinterpret_cast(src), reinterpret_cast<__half *>(dst), + matrix_size, dst_stride_0, dst_stride_1); + break; + case INFINI_DTYPE_BF16: + scatterMatrix2dContiguousToStrided<<(blocks), threads, 0, stream>>>( + reinterpret_cast(src), reinterpret_cast(dst), + matrix_size, dst_stride_0, dst_stride_1); + break; + case INFINI_DTYPE_F32: + scatterMatrix2dContiguousToStrided<<(blocks), threads, 0, stream>>>( + reinterpret_cast(src), reinterpret_cast(dst), + matrix_size, dst_stride_0, dst_stride_1); + break; + case INFINI_DTYPE_F64: + scatterMatrix2dContiguousToStrided<<(blocks), threads, 0, stream>>>( + reinterpret_cast(src), reinterpret_cast(dst), + matrix_size, dst_stride_0, dst_stride_1); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + CHECK_CUDA(cudaGetLastError()); + return INFINI_STATUS_SUCCESS; +} + +#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) +using GemmComputeType = cudaDataType; +#else +using GemmComputeType = cublasComputeType_t; +#endif + +struct GemmTypeConfig { + cudaDataType io_type; + GemmComputeType compute_type; +}; + +infiniStatus_t 
getGemmTypeConfig(infiniDtype_t dtype, GemmTypeConfig &cfg) { + switch (dtype) { + case INFINI_DTYPE_F16: + cfg.io_type = CUDA_R_16F; +#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) + cfg.compute_type = CUDA_R_32F; +#else + cfg.compute_type = CUBLAS_COMPUTE_32F; +#endif + return INFINI_STATUS_SUCCESS; + case INFINI_DTYPE_BF16: + cfg.io_type = CUDA_R_16BF; +#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) + cfg.compute_type = CUDA_R_32F; +#else + cfg.compute_type = CUBLAS_COMPUTE_32F; +#endif + return INFINI_STATUS_SUCCESS; + case INFINI_DTYPE_F32: + cfg.io_type = CUDA_R_32F; +#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) + cfg.compute_type = CUDA_R_32F; +#else + cfg.compute_type = CUBLAS_COMPUTE_32F; +#endif + return INFINI_STATUS_SUCCESS; + case INFINI_DTYPE_F64: + cfg.io_type = CUDA_R_64F; +#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) + cfg.compute_type = CUDA_R_64F; +#else + cfg.compute_type = CUBLAS_COMPUTE_64F; +#endif + return INFINI_STATUS_SUCCESS; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +// Compute row-major C = A * B using cuBLAS column-major GEMM: +// C_col = B_col * A_col, where *_col views the same memory as column-major. 
+infiniStatus_t gemmRowMajorSquare( + cublasHandle_t handle, + const GemmTypeConfig &cfg, + infiniDtype_t dtype, + int n, + const void *a, + const void *b, + void *c) { + + if (dtype == INFINI_DTYPE_F64) { + const double alpha = 1.0; + const double beta = 0.0; + CHECK_CUBLAS(cublasGemmEx( + handle, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + n, + n, + &alpha, + b, + cfg.io_type, + n, + a, + cfg.io_type, + n, + &beta, + c, + cfg.io_type, + n, + cfg.compute_type, + CUBLAS_GEMM_DEFAULT)); + return INFINI_STATUS_SUCCESS; + } + + const float alpha = 1.0f; + const float beta = 0.0f; + CHECK_CUBLAS(cublasGemmEx( + handle, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + n, + n, + &alpha, + b, + cfg.io_type, + n, + a, + cfg.io_type, + n, + &beta, + c, + cfg.io_type, + n, + cfg.compute_type, + CUBLAS_GEMM_DEFAULT)); + return INFINI_STATUS_SUCCESS; +} + +} // namespace + +struct Descriptor::Opaque { + std::shared_ptr internal; + + Opaque(std::shared_ptr internal_) + : internal(internal_) {} +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n) { + + if (handle == nullptr || desc_ptr == nullptr || y_desc == nullptr || x_desc == nullptr) { + return INFINI_STATUS_BAD_PARAM; + } + if (n < 0) { + return INFINI_STATUS_BAD_PARAM; + } + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + if (y_desc->dtype() != dtype) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 2 || x_shape[0] != x_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (y_shape != x_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + if (x_desc->hasBroadcastDim() || y_desc->hasBroadcastDim()) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + if (x_shape[0] 
> static_cast(std::numeric_limits::max())) { + return INFINI_STATUS_BAD_PARAM; + } + + auto x_strides = x_desc->strides(); + auto y_strides = y_desc->strides(); + bool x_contiguous = x_desc->isContiguous(); + bool y_contiguous = y_desc->isContiguous(); + + size_t matrix_numel = x_desc->numel(); + size_t matrix_bytes = matrix_numel * infiniSizeOf(dtype); + size_t workspace_size = 0; + if (n != 0) { + workspace_size = matrix_bytes * (y_contiguous ? 2 : 3); + } + + auto handle_nvidia = reinterpret_cast(handle); + Descriptor *desc = new Descriptor(dtype, x_shape[0], static_cast(n), + matrix_numel, y_desc->numel(), workspace_size, + x_strides[0], x_strides[1], y_strides[0], y_strides[1], + x_contiguous, y_contiguous, + handle->device, handle->device_id); + desc->_opaque = new Opaque(handle_nvidia->internal()); + *desc_ptr = desc; + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + if (x == nullptr || y == nullptr) { + return INFINI_STATUS_BAD_PARAM; + } + if (workspace_size < this->workspaceSize()) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + if (this->workspaceSize() != 0 && workspace == nullptr) { + return INFINI_STATUS_BAD_PARAM; + } + + auto cuda_stream = reinterpret_cast(stream); + if (matrix_size == 0) { + return INFINI_STATUS_SUCCESS; + } + if (n == 0) { + CHECK_STATUS(initializeIdentity( + y, _dtype, matrix_size, y_contiguous, y_stride_0, y_stride_1, cuda_stream)); + return INFINI_STATUS_SUCCESS; + } + + size_t matrix_bytes = input_size * infiniSizeOf(_dtype); + char *workspace_ptr = reinterpret_cast(workspace); + void *base = workspace_ptr; + void *temp = workspace_ptr + matrix_bytes; + void *contiguous_output = y_contiguous ? 
y : (workspace_ptr + matrix_bytes * 2); + + CHECK_STATUS(initializeIdentity( + contiguous_output, _dtype, matrix_size, true, 0, 0, cuda_stream)); + + if (x_contiguous) { + CHECK_CUDA(cudaMemcpyAsync(base, x, matrix_bytes, cudaMemcpyDeviceToDevice, cuda_stream)); + } else { + CHECK_STATUS(packInputToContiguous( + base, x, _dtype, matrix_size, x_stride_0, x_stride_1, cuda_stream)); + } + + GemmTypeConfig cfg; + CHECK_STATUS(getGemmTypeConfig(_dtype, cfg)); + + void *result = contiguous_output; + void *scratch = temp; + void *base_matrix = base; + size_t power = n; + int matrix_dim = static_cast(matrix_size); + + CHECK_STATUS(_opaque->internal->useCublas( + cuda_stream, + [&](cublasHandle_t handle) { + while (power > 0) { + if (power & 1) { + CHECK_STATUS(gemmRowMajorSquare(handle, cfg, _dtype, matrix_dim, result, base_matrix, scratch)); + std::swap(result, scratch); + } + power >>= 1; + if (power == 0) { + break; + } + CHECK_STATUS(gemmRowMajorSquare(handle, cfg, _dtype, matrix_dim, base_matrix, base_matrix, scratch)); + std::swap(base_matrix, scratch); + } + return INFINI_STATUS_SUCCESS; + })); + + if (y_contiguous) { + if (result != y) { + CHECK_CUDA(cudaMemcpyAsync(y, result, matrix_bytes, cudaMemcpyDeviceToDevice, cuda_stream)); + } + } else { + CHECK_STATUS(scatterContiguousToOutput( + y, result, _dtype, matrix_size, y_stride_0, y_stride_1, cuda_stream)); + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::matrix_power::nvidia diff --git a/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cuh b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cuh new file mode 100644 index 000000000..d11c6914e --- /dev/null +++ b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cuh @@ -0,0 +1,71 @@ +#ifndef __MATRIX_POWER_NVIDIA_H__ +#define __MATRIX_POWER_NVIDIA_H__ + +#include "../../../operator.h" +#include + +namespace op::matrix_power::nvidia { + +class Descriptor final : public InfiniopDescriptor { + struct Opaque; + Opaque *_opaque; + 
infiniDtype_t _dtype; + size_t matrix_size; + size_t n; + size_t input_size; + size_t output_size; + size_t workspace_size; + ptrdiff_t x_stride_0; + ptrdiff_t x_stride_1; + ptrdiff_t y_stride_0; + ptrdiff_t y_stride_1; + bool x_contiguous; + bool y_contiguous; + + Descriptor(infiniDtype_t dtype, size_t matrix_size, size_t n, + size_t input_size, size_t output_size, + size_t workspace_size, + ptrdiff_t x_stride_0, ptrdiff_t x_stride_1, + ptrdiff_t y_stride_0, ptrdiff_t y_stride_1, + bool x_contiguous, bool y_contiguous, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _opaque(nullptr), + _dtype(dtype), + matrix_size(matrix_size), + n(n), + input_size(input_size), + output_size(output_size), + workspace_size(workspace_size), + x_stride_0(x_stride_0), + x_stride_1(x_stride_1), + y_stride_0(y_stride_0), + y_stride_1(y_stride_1), + x_contiguous(x_contiguous), + y_contiguous(y_contiguous) {} + +public: + ~Descriptor(); + + friend struct Opaque; + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n); + + size_t workspaceSize() const { return workspace_size; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::matrix_power::nvidia + +#endif // __MATRIX_POWER_NVIDIA_H__ diff --git a/src/infiniop/ops/matrix_power/operator.cc b/src/infiniop/ops/matrix_power/operator.cc new file mode 100644 index 000000000..79a81077a --- /dev/null +++ b/src/infiniop/ops/matrix_power/operator.cc @@ -0,0 +1,159 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/matrix_power.h" + +#ifdef ENABLE_CPU_API +#include "cpu/matrix_power_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/matrix_power_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include 
"metax/matrix_power_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/matrix_power_moore.h" +#endif + +__INFINI_C __export infiniStatus_t infiniopCreateMatrixPowerDescriptor( + infiniopHandle_t handle, + infiniopMatrixPowerDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::matrix_power::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ + n) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C __export infiniStatus_t infiniopGetMatrixPowerWorkspaceSize(infiniopMatrixPowerDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__INFINI_C __export infiniStatus_t infiniopMatrixPower( + infiniopMatrixPowerDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return 
reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, x, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C __export infiniStatus_t +infiniopDestroyMatrixPowerDescriptor(infiniopMatrixPowerDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.cc b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.cc new file mode 100644 index 000000000..15d0d6906 --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.cc @@ -0,0 +1,137 @@ +#include "pixel_shuffle_cpu.h" +#include "../../../tensor.h" + +namespace op::pixel_shuffle::cpu { + +utils::Result PixelShuffleInfo::create( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + int upscale_factor) { + + if (upscale_factor <= 0) { + return INFINI_STATUS_BAD_PARAM; + } + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 4) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + 
size_t batch = x_shape[0]; + size_t in_channels = x_shape[1]; + size_t height = x_shape[2]; + size_t width = x_shape[3]; + + // Input: (N, C*r^2, H, W) -> Output: (N, C, H*r, W*r) + if (in_channels % (upscale_factor * upscale_factor) != 0) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t out_channels = in_channels / (upscale_factor * upscale_factor); + size_t out_height = height * upscale_factor; + size_t out_width = width * upscale_factor; + + std::vector expected_y_shape = {batch, out_channels, out_height, out_width}; + if (y_shape != expected_y_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + PixelShuffleInfo info; + info.batch = batch; + info.in_channels = in_channels; + info.out_channels = out_channels; + info.height = height; + info.width = width; + info.upscale_factor = upscale_factor; + info.input_size = x_desc->numel(); + info.output_size = y_desc->numel(); + + return utils::Result(std::move(info)); +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES); + CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + + auto info_result = PixelShuffleInfo::create(x_desc, y_desc, upscale_factor); + CHECK_RESULT(info_result); + + *desc_ptr = new Descriptor(dtype, info_result.take(), handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +void pixel_shuffle_impl( + const PixelShuffleInfo &info, + T *y, + const T *x) { + + int r = info.upscale_factor; + + // Input: (N, C*r^2, H, W) + // 
Output: (N, C, H*r, W*r) + for (size_t n = 0; n < info.batch; ++n) { + for (size_t c = 0; c < info.out_channels; ++c) { + for (size_t h = 0; h < info.height; ++h) { + for (size_t w = 0; w < info.width; ++w) { + for (int i = 0; i < r; ++i) { + for (int j = 0; j < r; ++j) { + // Input channel index + size_t in_c = c * r * r + i * r + j; + // Input position + size_t in_idx = ((n * info.in_channels + in_c) * info.height + h) * info.width + w; + // Output position + size_t out_h = h * r + i; + size_t out_w = w * r + j; + size_t out_idx = ((n * info.out_channels + c) * (info.height * r) + out_h) * (info.width * r) + out_w; + y[out_idx] = x[in_idx]; + } + } + } + } + } + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + pixel_shuffle_impl(_info, reinterpret_cast(y), reinterpret_cast(x)); + break; + case INFINI_DTYPE_BF16: + pixel_shuffle_impl(_info, reinterpret_cast(y), reinterpret_cast(x)); + break; + case INFINI_DTYPE_F32: + pixel_shuffle_impl(_info, reinterpret_cast(y), reinterpret_cast(x)); + break; + case INFINI_DTYPE_F64: + pixel_shuffle_impl(_info, reinterpret_cast(y), reinterpret_cast(x)); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::pixel_shuffle::cpu diff --git a/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h new file mode 100644 index 000000000..2fa5e00fe --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h @@ -0,0 +1,58 @@ +#ifndef __PIXEL_SHUFFLE_CPU_H__ +#define __PIXEL_SHUFFLE_CPU_H__ + +#include "../../../devices/cpu/common_cpu.h" +#include "../../../operator.h" +#include + +namespace op::pixel_shuffle::cpu { + +struct PixelShuffleInfo { + size_t batch; + size_t in_channels; + size_t out_channels; + size_t height; + size_t width; + int upscale_factor; + 
size_t input_size; + size_t output_size; + + static utils::Result create( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + int upscale_factor); +}; + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + PixelShuffleInfo _info; + + Descriptor(infiniDtype_t dtype, PixelShuffleInfo info, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::pixel_shuffle::cpu + +#endif // __PIXEL_SHUFFLE_CPU_H__ diff --git a/src/infiniop/ops/pixel_shuffle/cuda/kernel.cuh b/src/infiniop/ops/pixel_shuffle/cuda/kernel.cuh new file mode 100644 index 000000000..874b29cce --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/cuda/kernel.cuh @@ -0,0 +1,92 @@ +#pragma once +#include +#include + +namespace op::cuda { + +template +__global__ void pixel_shuffle_kernel( + T *output, + const T *input, + size_t batch, + size_t out_channels, + size_t height, + size_t width, + int r) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = batch * out_channels * height * width; + + if (idx >= total) { + return; + } + + size_t n = idx / (out_channels * height * width); + size_t rem = idx % (out_channels * height * width); + size_t c = rem / (height * width); + rem = rem % (height * width); + size_t oh = rem / width; + size_t ow = rem % width; + + // Calculate input indices + size_t w = ow / r; + size_t h = oh / r; + size_t i = oh % r; + size_t j = ow % r; + size_t in_c = c * r * r + i * r + j; + + size_t in_idx = ((n * (out_channels * r * r) + in_c) 
* (height / r) + h) * (width / r) + w; + size_t out_idx = ((n * out_channels + c) * height + oh) * width + ow; + + output[out_idx] = input[in_idx]; +} + +template +__global__ void pixel_shuffle_kernel_strided( + T *output, + const T *input, + size_t batch, + size_t out_channels, + size_t out_height, + size_t out_width, + int r, + ptrdiff_t x_stride0, + ptrdiff_t x_stride1, + ptrdiff_t x_stride2, + ptrdiff_t x_stride3, + ptrdiff_t y_stride0, + ptrdiff_t y_stride1, + ptrdiff_t y_stride2, + ptrdiff_t y_stride3) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = batch * out_channels * out_height * out_width; + + if (idx >= total) { + return; + } + + const size_t spatial = out_height * out_width; + const size_t chw = out_channels * spatial; + + size_t n = idx / chw; + size_t rem = idx % chw; + size_t c = rem / spatial; + rem = rem % spatial; + size_t oh = rem / out_width; + size_t ow = rem % out_width; + + const size_t upscale = static_cast(r); + const size_t ih = oh / upscale; + const size_t iw = ow / upscale; + const size_t i = oh % upscale; + const size_t j = ow % upscale; + const size_t in_c = c * upscale * upscale + i * upscale + j; + + const ptrdiff_t in_offset = static_cast(n) * x_stride0 + static_cast(in_c) * x_stride1 + static_cast(ih) * x_stride2 + static_cast(iw) * x_stride3; + const ptrdiff_t out_offset = static_cast(n) * y_stride0 + static_cast(c) * y_stride1 + static_cast(oh) * y_stride2 + static_cast(ow) * y_stride3; + + output[out_offset] = input[in_offset]; +} + +} // namespace op::cuda diff --git a/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.h b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.h new file mode 100644 index 000000000..34f216655 --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.h @@ -0,0 +1,56 @@ +#ifndef __PIXEL_SHUFFLE_METAX_H__ +#define __PIXEL_SHUFFLE_METAX_H__ + +#include "../../../operator.h" + +namespace op::pixel_shuffle::metax { + +class Descriptor final : 
public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t batch; + size_t in_channels; + size_t out_channels; + size_t height; + size_t width; + int upscale_factor; + size_t input_size; + size_t output_size; + + Descriptor(infiniDtype_t dtype, size_t batch, size_t in_channels, size_t out_channels, + size_t height, size_t width, int upscale_factor, + size_t input_size, size_t output_size, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + batch(batch), + in_channels(in_channels), + out_channels(out_channels), + height(height), + width(width), + upscale_factor(upscale_factor), + input_size(input_size), + output_size(output_size) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::pixel_shuffle::metax + +#endif // __PIXEL_SHUFFLE_METAX_H__ diff --git a/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.maca b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.maca new file mode 100644 index 000000000..3c8afd6f2 --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.maca @@ -0,0 +1,112 @@ +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../tensor.h" +#include "../cuda/kernel.cuh" +#include "pixel_shuffle_metax.h" + +namespace op::pixel_shuffle::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, 
INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES); + CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + + if (upscale_factor <= 0) { + return INFINI_STATUS_BAD_PARAM; + } + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 4) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t batch = x_shape[0]; + size_t in_channels = x_shape[1]; + size_t height = x_shape[2]; + size_t width = x_shape[3]; + + if (in_channels % (upscale_factor * upscale_factor) != 0) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t out_channels = in_channels / (upscale_factor * upscale_factor); + size_t out_height = height * upscale_factor; + size_t out_width = width * upscale_factor; + + std::vector expected_y_shape = {batch, out_channels, out_height, out_width}; + if (y_shape != expected_y_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + *desc_ptr = new Descriptor(dtype, batch, in_channels, out_channels, + height, width, upscale_factor, + x_desc->numel(), y_desc->numel(), + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + auto hc_stream = reinterpret_cast(stream); + constexpr int BLOCK_SIZE = 256; + size_t total = output_size; + if (total == 0) { + return INFINI_STATUS_SUCCESS; + } + int num_blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE; + + switch (_dtype) { + case INFINI_DTYPE_F16: + cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + case INFINI_DTYPE_BF16: + 
cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + case INFINI_DTYPE_F32: + cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + case INFINI_DTYPE_F64: + cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::pixel_shuffle::metax diff --git a/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.h b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.h new file mode 100644 index 000000000..e47a3edb2 --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.h @@ -0,0 +1,56 @@ +#ifndef __PIXEL_SHUFFLE_MOORE_H__ +#define __PIXEL_SHUFFLE_MOORE_H__ + +#include "../../../operator.h" + +namespace op::pixel_shuffle::moore { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t batch; + size_t in_channels; + size_t out_channels; + size_t height; + size_t width; + int upscale_factor; + size_t input_size; + size_t output_size; + + Descriptor(infiniDtype_t dtype, size_t batch, size_t in_channels, size_t out_channels, + size_t height, size_t width, int upscale_factor, + size_t input_size, size_t output_size, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + batch(batch), + in_channels(in_channels), + out_channels(out_channels), + height(height), + width(width), + upscale_factor(upscale_factor), + input_size(input_size), + output_size(output_size) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + 
infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::pixel_shuffle::moore + +#endif // __PIXEL_SHUFFLE_MOORE_H__ diff --git a/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.mu b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.mu new file mode 100644 index 000000000..5d2c113d6 --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.mu @@ -0,0 +1,112 @@ +#include "../../../devices/moore/moore_common.h" +#include "../../../devices/moore/moore_kernel_common.h" +#include "../../../tensor.h" +#include "../cuda/kernel.cuh" +#include "pixel_shuffle_moore.h" + +namespace op::pixel_shuffle::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES); + CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + + if (upscale_factor <= 0) { + return INFINI_STATUS_BAD_PARAM; + } + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 4) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t batch = x_shape[0]; + size_t in_channels = x_shape[1]; + size_t height = x_shape[2]; + size_t width = x_shape[3]; + + if (in_channels % (upscale_factor * upscale_factor) != 0) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t out_channels 
= in_channels / (upscale_factor * upscale_factor); + size_t out_height = height * upscale_factor; + size_t out_width = width * upscale_factor; + + std::vector expected_y_shape = {batch, out_channels, out_height, out_width}; + if (y_shape != expected_y_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + *desc_ptr = new Descriptor(dtype, batch, in_channels, out_channels, + height, width, upscale_factor, + x_desc->numel(), y_desc->numel(), + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + auto musa_stream = reinterpret_cast(stream); + constexpr int BLOCK_SIZE = 256; + size_t total = output_size; + if (total == 0) { + return INFINI_STATUS_SUCCESS; + } + int num_blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE; + + switch (_dtype) { + case INFINI_DTYPE_F16: + cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + case INFINI_DTYPE_BF16: + cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + case INFINI_DTYPE_F32: + cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + case INFINI_DTYPE_F64: + cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::pixel_shuffle::moore diff --git a/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cu 
b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cu new file mode 100644 index 000000000..af2d006ae --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cu @@ -0,0 +1,129 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../../../tensor.h" +#include "../cuda/kernel.cuh" +#include "pixel_shuffle_nvidia.cuh" +#include + +namespace op::pixel_shuffle::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + if (y_desc->dtype() != dtype) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + if (upscale_factor <= 0) { + return INFINI_STATUS_BAD_PARAM; + } + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 4 || y_shape.size() != 4) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + if (x_desc->hasBroadcastDim() || y_desc->hasBroadcastDim()) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + size_t batch = x_shape[0]; + size_t in_channels = x_shape[1]; + size_t height = x_shape[2]; + size_t width = x_shape[3]; + + if (in_channels % (upscale_factor * upscale_factor) != 0) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t out_channels = in_channels / (upscale_factor * upscale_factor); + size_t out_height = height * upscale_factor; + size_t out_width = width * upscale_factor; + + std::vector expected_y_shape = {batch, out_channels, out_height, out_width}; + if (y_shape != expected_y_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + std::array x_strides = {x_desc->stride(0), x_desc->stride(1), x_desc->stride(2), x_desc->stride(3)}; + std::array y_strides = {y_desc->stride(0), y_desc->stride(1), 
y_desc->stride(2), y_desc->stride(3)}; + + *desc_ptr = new Descriptor(dtype, batch, in_channels, out_channels, + height, width, upscale_factor, + x_desc->numel(), y_desc->numel(), + x_strides, y_strides, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + auto cuda_stream = reinterpret_cast(stream); + constexpr int BLOCK_SIZE = 256; + const size_t out_height = height * static_cast(upscale_factor); + const size_t out_width = width * static_cast(upscale_factor); + const size_t total = output_size; + if (total == 0) { + return INFINI_STATUS_SUCCESS; + } + int num_blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE; + + switch (_dtype) { + case INFINI_DTYPE_F16: + cuda::pixel_shuffle_kernel_strided<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, out_height, out_width, + upscale_factor, + x_strides[0], x_strides[1], x_strides[2], x_strides[3], + y_strides[0], y_strides[1], y_strides[2], y_strides[3]); + break; + case INFINI_DTYPE_BF16: + cuda::pixel_shuffle_kernel_strided<__nv_bfloat16><<>>( + reinterpret_cast<__nv_bfloat16 *>(y), + reinterpret_cast(x), + batch, out_channels, out_height, out_width, + upscale_factor, + x_strides[0], x_strides[1], x_strides[2], x_strides[3], + y_strides[0], y_strides[1], y_strides[2], y_strides[3]); + break; + case INFINI_DTYPE_F32: + cuda::pixel_shuffle_kernel_strided<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, out_height, out_width, + upscale_factor, + x_strides[0], x_strides[1], x_strides[2], x_strides[3], + y_strides[0], y_strides[1], y_strides[2], y_strides[3]); + break; + case INFINI_DTYPE_F64: + cuda::pixel_shuffle_kernel_strided<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, out_height, out_width, + upscale_factor, + x_strides[0], x_strides[1], x_strides[2], x_strides[3], + y_strides[0], 
y_strides[1], y_strides[2], y_strides[3]); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::pixel_shuffle::nvidia diff --git a/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cuh b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cuh new file mode 100644 index 000000000..a5c3b7506 --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cuh @@ -0,0 +1,64 @@ +#ifndef __PIXEL_SHUFFLE_NVIDIA_H__ +#define __PIXEL_SHUFFLE_NVIDIA_H__ + +#include "../../../operator.h" +#include +#include + +namespace op::pixel_shuffle::nvidia { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t batch; + size_t in_channels; + size_t out_channels; + size_t height; + size_t width; + int upscale_factor; + size_t input_size; + size_t output_size; + std::array x_strides; + std::array y_strides; + + Descriptor(infiniDtype_t dtype, size_t batch, size_t in_channels, size_t out_channels, + size_t height, size_t width, int upscale_factor, + size_t input_size, size_t output_size, + std::array x_strides, + std::array y_strides, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + batch(batch), + in_channels(in_channels), + out_channels(out_channels), + height(height), + width(width), + upscale_factor(upscale_factor), + input_size(input_size), + output_size(output_size), + x_strides(x_strides), + y_strides(y_strides) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::pixel_shuffle::nvidia + +#endif // __PIXEL_SHUFFLE_NVIDIA_H__ diff 
--git a/src/infiniop/ops/pixel_shuffle/operator.cc b/src/infiniop/ops/pixel_shuffle/operator.cc new file mode 100644 index 000000000..3092787ad --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/operator.cc @@ -0,0 +1,159 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/pixel_shuffle.h" + +#ifdef ENABLE_CPU_API +#include "cpu/pixel_shuffle_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/pixel_shuffle_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/pixel_shuffle_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/pixel_shuffle_moore.h" +#endif + +__INFINI_C __export infiniStatus_t infiniopCreatePixelShuffleDescriptor( + infiniopHandle_t handle, + infiniopPixelShuffleDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::pixel_shuffle::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ + upscale_factor) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C __export infiniStatus_t infiniopGetPixelShuffleWorkspaceSize(infiniopPixelShuffleDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef 
ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__INFINI_C __export infiniStatus_t infiniopPixelShuffle( + infiniopPixelShuffleDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, x, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C __export infiniStatus_t +infiniopDestroyPixelShuffleDescriptor(infiniopPixelShuffleDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/erf.py b/test/infiniop/erf.py new file mode 100644 index 000000000..733376b97 --- /dev/null +++ b/test/infiniop/erf.py @@ 
-0,0 +1,118 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import * +from enum import Enum, auto +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +_TEST_CASES_ = [ + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4, 4), None, None), +] + +_TEST_CASES = _TEST_CASES_ +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, +} + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + dtype=torch.float16, + sync=None, +): + print( + f"Testing Erf on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + input = TestTensor(shape, input_stride, dtype, device) + output = TestTensor(shape, output_stride, dtype, device) + + output.update_torch_tensor(torch.erf(input.torch_tensor())) + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateErfDescriptor( + handle, ctypes.byref(descriptor), output.descriptor, input.descriptor + ) + ) + + input.destroy_desc() + output.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetErfWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_f(): + check_error( + LIBINFINIOP.infiniopErf( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_f() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), 
atol=atol, rtol=rtol) + assert torch.allclose( + output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.erf(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_f(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyErfDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/erfc.py b/test/infiniop/erfc.py new file mode 100644 index 000000000..b2424496d --- /dev/null +++ b/test/infiniop/erfc.py @@ -0,0 +1,117 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import * +from enum import Enum, auto +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +_TEST_CASES_ = [ + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4, 4), None, None), +] + +_TEST_CASES = _TEST_CASES_ +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, +} + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + dtype=torch.float16, + sync=None, +): + print( + f"Testing Erfc on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + 
f"dtype:{InfiniDtypeNames[dtype]}" + ) + input = TestTensor(shape, input_stride, dtype, device) + output = TestTensor(shape, output_stride, dtype, device) + + output.update_torch_tensor(torch.erfc(input.torch_tensor())) + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateErfcDescriptor( + handle, ctypes.byref(descriptor), output.descriptor, input.descriptor + ) + ) + + input.destroy_desc() + output.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetErfcWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_f(): + check_error( + LIBINFINIOP.infiniopErfc( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_f() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose( + output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol + ) + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.erfc(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_f(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyErfcDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/erfinv.py b/test/infiniop/erfinv.py new file mode 100644 index 000000000..1979ee9c1 --- /dev/null +++ b/test/infiniop/erfinv.py @@ -0,0 +1,118 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import * 
+from enum import Enum, auto +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +_TEST_CASES_ = [ + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4, 4), None, None), +] + +_TEST_CASES = _TEST_CASES_ +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, +} + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + dtype=torch.float16, + sync=None, +): + print( + f"Testing Erfinv on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + input = TestTensor(shape, input_stride, dtype, device, scale=1.8, bias=-0.9) + + output = TestTensor(shape, output_stride, dtype, device) + output.update_torch_tensor(torch.erfinv(input.torch_tensor())) + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateErfinvDescriptor( + handle, ctypes.byref(descriptor), output.descriptor, input.descriptor + ) + ) + + input.destroy_desc() + output.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetErfinvWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_f(): + check_error( + LIBINFINIOP.infiniopErfinv( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_f() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose( + 
output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.erfinv(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_f(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyErfinvDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index cd3ad1b82..3675ca8d1 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -1753,3 +1753,165 @@ def silu_and_mul(lib): lib.infiniopDestroySiluAndMulDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] + + +@OpRegister.operator +def erf(lib): + lib.infiniopCreateErfDescriptor.restype = c_int32 + lib.infiniopCreateErfDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetErfWorkspaceSize.restype = c_int32 + lib.infiniopGetErfWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopErf.restype = c_int32 + lib.infiniopErf.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyErfDescriptor.restype = c_int32 + lib.infiniopDestroyErfDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def erfc(lib): + lib.infiniopCreateErfcDescriptor.restype = c_int32 + lib.infiniopCreateErfcDescriptor.argtypes = [ + infiniopHandle_t, + 
POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetErfcWorkspaceSize.restype = c_int32 + lib.infiniopGetErfcWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopErfc.restype = c_int32 + lib.infiniopErfc.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyErfcDescriptor.restype = c_int32 + lib.infiniopDestroyErfcDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def erfinv(lib): + lib.infiniopCreateErfinvDescriptor.restype = c_int32 + lib.infiniopCreateErfinvDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetErfinvWorkspaceSize.restype = c_int32 + lib.infiniopGetErfinvWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopErfinv.restype = c_int32 + lib.infiniopErfinv.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyErfinvDescriptor.restype = c_int32 + lib.infiniopDestroyErfinvDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def matrix_power(lib): + lib.infiniopCreateMatrixPowerDescriptor.restype = c_int32 + lib.infiniopCreateMatrixPowerDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_int32, + ] + + lib.infiniopGetMatrixPowerWorkspaceSize.restype = c_int32 + lib.infiniopGetMatrixPowerWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMatrixPower.restype = c_int32 + lib.infiniopMatrixPower.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + 
lib.infiniopDestroyMatrixPowerDescriptor.restype = c_int32 + lib.infiniopDestroyMatrixPowerDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def pixel_shuffle(lib): + lib.infiniopCreatePixelShuffleDescriptor.restype = c_int32 + lib.infiniopCreatePixelShuffleDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_int32, + ] + + lib.infiniopGetPixelShuffleWorkspaceSize.restype = c_int32 + lib.infiniopGetPixelShuffleWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopPixelShuffle.restype = c_int32 + lib.infiniopPixelShuffle.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyPixelShuffleDescriptor.restype = c_int32 + lib.infiniopDestroyPixelShuffleDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] diff --git a/test/infiniop/matrix_power.py b/test/infiniop/matrix_power.py new file mode 100644 index 000000000..7527fe249 --- /dev/null +++ b/test/infiniop/matrix_power.py @@ -0,0 +1,106 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +_TEST_CASES_ = [ + ((4, 4), None, None), + ((8, 8), None, None), +] + +_POWERS = [2, 3, 4] + +_TEST_CASES = [test_case + (n,) for test_case in _TEST_CASES_ for n in _POWERS] + +_TENSOR_DTYPES = [InfiniDtype.F32] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, +} + + +def test(handle, device, shape, input_stride, output_stride, n, dtype, sync=None): + print( + f"Testing MatrixPower on {InfiniDeviceNames[device]} with 
shape:{shape} input_stride:{input_stride} output_stride:{output_stride} power:{n} " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + if shape[-1] != shape[-2]: + return + + input = TestTensor(shape, input_stride, dtype, device) + output = TestTensor(shape, output_stride, dtype, device) + + output.update_torch_tensor(torch.linalg.matrix_power(input.torch_tensor(), n)) + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateMatrixPowerDescriptor( + handle, ctypes.byref(descriptor), output.descriptor, input.descriptor, n + ) + ) + + input.destroy_desc() + output.destroy_desc() + + ws = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetMatrixPowerWorkspaceSize(descriptor, ctypes.byref(ws)) + ) + workspace = TestWorkspace(ws.value, device) + + def lib_f(): + check_error( + LIBINFINIOP.infiniopMatrixPower( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_f() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose( + output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.linalg.matrix_power(input.torch_tensor(), n), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_f(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyMatrixPowerDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/pixel_shuffle.py b/test/infiniop/pixel_shuffle.py new file mode 100644 index 000000000..2fb690fe5 --- /dev/null +++ b/test/infiniop/pixel_shuffle.py @@ -0,0 +1,114 @@ +import torch +import ctypes +from 
ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + + +_TEST_CASES = [ + ((1, 4, 8, 8), None, None, 2), + ((2, 16, 4, 4), None, None, 2), + ((2, 16, 4, 4), None, None, 4), +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, +} + + +def test(handle, device, shape, input_stride, output_stride, upscale, dtype, sync=None): + print( + f"Testing PixelShuffle on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} upscale:{upscale} " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + r = upscale + assert shape[1] % (r * r) == 0 + + input = TestTensor(shape, input_stride, dtype, device) + + output_shape = ( + shape[0], + shape[1] // (r * r), + shape[2] * r, + shape[3] * r, + ) + + output = TestTensor(output_shape, output_stride, dtype, device) + + output.update_torch_tensor( + torch.nn.functional.pixel_shuffle(input.torch_tensor(), r) + ) + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreatePixelShuffleDescriptor( + handle, ctypes.byref(descriptor), output.descriptor, input.descriptor, r + ) + ) + + input.destroy_desc() + output.destroy_desc() + + ws = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetPixelShuffleWorkspaceSize(descriptor, ctypes.byref(ws)) + ) + workspace = TestWorkspace(ws.value, device) + + def lib_f(): + check_error( + LIBINFINIOP.infiniopPixelShuffle( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_f() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, 
rtol=rtol) + assert torch.allclose( + output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.nn.functional.pixel_shuffle(input.torch_tensor(), r), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_f(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyPixelShuffleDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mTest passed!\033[0m")