diff --git a/include/infiniop.h b/include/infiniop.h index fe9537876..104476037 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -30,6 +30,9 @@ #include "infiniop/ops/dequantize_gptq.h" #include "infiniop/ops/embedding.h" #include "infiniop/ops/equal.h" +#include "infiniop/ops/erf.h" +#include "infiniop/ops/erfc.h" +#include "infiniop/ops/erfinv.h" #include "infiniop/ops/flash_attention.h" #include "infiniop/ops/flipud.h" #include "infiniop/ops/float_power.h" @@ -58,12 +61,14 @@ #include "infiniop/ops/logcumsumexp.h" #include "infiniop/ops/lp_norm.h" #include "infiniop/ops/masked_select.h" +#include "infiniop/ops/matrix_power.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/multi_margin_loss.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/paged_attention.h" #include "infiniop/ops/paged_attention_prefill.h" #include "infiniop/ops/paged_caching.h" +#include "infiniop/ops/pixel_shuffle.h" #include "infiniop/ops/quant/per_channel_quant_int8.h" #include "infiniop/ops/quant/per_tensor_quant_int8.h" #include "infiniop/ops/random_sample.h" diff --git a/include/infiniop/ops/erf.h b/include/infiniop/ops/erf.h new file mode 100644 index 000000000..8786b31a3 --- /dev/null +++ b/include/infiniop/ops/erf.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ERF_API_H__ +#define __INFINIOP_ERF_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopErfDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateErfDescriptor(infiniopHandle_t handle, + infiniopErfDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__INFINI_C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopErf(infiniopErfDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc); + 
+#endif diff --git a/include/infiniop/ops/erfc.h b/include/infiniop/ops/erfc.h new file mode 100644 index 000000000..6bceebe35 --- /dev/null +++ b/include/infiniop/ops/erfc.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ERFC_API_H__ +#define __INFINIOP_ERFC_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopErfcDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateErfcDescriptor(infiniopHandle_t handle, + infiniopErfcDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__INFINI_C __export infiniStatus_t infiniopGetErfcWorkspaceSize(infiniopErfcDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopErfc(infiniopErfcDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyErfcDescriptor(infiniopErfcDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/erfinv.h b/include/infiniop/ops/erfinv.h new file mode 100644 index 000000000..b14975253 --- /dev/null +++ b/include/infiniop/ops/erfinv.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ERFINV_API_H__ +#define __INFINIOP_ERFINV_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopErfinvDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateErfinvDescriptor(infiniopHandle_t handle, + infiniopErfinvDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__INFINI_C __export infiniStatus_t infiniopGetErfinvWorkspaceSize(infiniopErfinvDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopErfinv(infiniopErfinvDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyErfinvDescriptor(infiniopErfinvDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/matrix_power.h 
b/include/infiniop/ops/matrix_power.h new file mode 100644 index 000000000..639d3cf02 --- /dev/null +++ b/include/infiniop/ops/matrix_power.h @@ -0,0 +1,25 @@ +#ifndef __INFINIOP_MATRIX_POWER_API_H__ +#define __INFINIOP_MATRIX_POWER_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopMatrixPowerDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreateMatrixPowerDescriptor(infiniopHandle_t handle, + infiniopMatrixPowerDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int n); + +__INFINI_C __export infiniStatus_t infiniopGetMatrixPowerWorkspaceSize(infiniopMatrixPowerDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopMatrixPower(infiniopMatrixPowerDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__INFINI_C __export infiniStatus_t infiniopDestroyMatrixPowerDescriptor(infiniopMatrixPowerDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/pixel_shuffle.h b/include/infiniop/ops/pixel_shuffle.h new file mode 100644 index 000000000..fac8eff2c --- /dev/null +++ b/include/infiniop/ops/pixel_shuffle.h @@ -0,0 +1,25 @@ +#ifndef __INFINIOP_PIXEL_SHUFFLE_API_H__ +#define __INFINIOP_PIXEL_SHUFFLE_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopPixelShuffleDescriptor_t; + +__INFINI_C __export infiniStatus_t infiniopCreatePixelShuffleDescriptor(infiniopHandle_t handle, + infiniopPixelShuffleDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int upscale_factor); + +__INFINI_C __export infiniStatus_t infiniopGetPixelShuffleWorkspaceSize(infiniopPixelShuffleDescriptor_t desc, size_t *size); + +__INFINI_C __export infiniStatus_t infiniopPixelShuffle(infiniopPixelShuffleDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__INFINI_C __export infiniStatus_t 
infiniopDestroyPixelShuffleDescriptor(infiniopPixelShuffleDescriptor_t desc); + +#endif diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.cc b/src/infiniop/ops/erf/cpu/erf_cpu.cc new file mode 100644 index 000000000..7d127bfae --- /dev/null +++ b/src/infiniop/ops/erf/cpu/erf_cpu.cc @@ -0,0 +1,52 @@ +#include "erf_cpu.h" + +namespace op::erf::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erf::cpu diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.h b/src/infiniop/ops/erf/cpu/erf_cpu.h new file mode 100644 index 000000000..74ad19e57 --- /dev/null +++ b/src/infiniop/ops/erf/cpu/erf_cpu.h @@ -0,0 +1,20 @@ +#ifndef __ERF_CPU_H__ +#define __ERF_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" 
+#include + +ELEMENTWISE_DESCRIPTOR(erf, cpu) + +namespace op::erf::cpu { +typedef struct ErfOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + return std::erf(x); + } +} ErfOp; +} // namespace op::erf::cpu + +#endif // __ERF_CPU_H__ diff --git a/src/infiniop/ops/erf/cuda/kernel.cuh b/src/infiniop/ops/erf/cuda/kernel.cuh new file mode 100644 index 000000000..8efe037e3 --- /dev/null +++ b/src/infiniop/ops/erf/cuda/kernel.cuh @@ -0,0 +1,33 @@ +#pragma once +#include +#include + +namespace op::cuda { + +struct ErfOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(T x) const { + if constexpr (std::is_same_v) { + return erff(x); + } else if constexpr (std::is_same_v) { + return ::erf(x); + } else { + // For F16/BF16: promote to float, compute, then cast back + float xf; + if constexpr (std::is_same_v) { + xf = __half2float(x); + return __float2half_rn(erff(xf)); + } else if constexpr (std::is_same_v) { + xf = __bfloat162float(x); + return __float2bfloat16_rn(erff(xf)); + } else { + xf = static_cast(x); + return static_cast(erff(xf)); + } + } + } +}; + +} // namespace op::cuda diff --git a/src/infiniop/ops/erf/erf.h b/src/infiniop/ops/erf/erf.h new file mode 100644 index 000000000..7c967dea2 --- /dev/null +++ b/src/infiniop/ops/erf/erf.h @@ -0,0 +1,8 @@ +#ifndef __ERF_H__ +#define __ERF_H__ + +#include "../../elementwise/elementwise.h" + +#define DESCRIPTOR(NAMESPACE) ELEMENTWISE_DESCRIPTOR(erf, NAMESPACE) + +#endif // __ERF_H__ diff --git a/src/infiniop/ops/erf/metax/erf_metax.h b/src/infiniop/ops/erf/metax/erf_metax.h new file mode 100644 index 000000000..5dfe23bbe --- /dev/null +++ b/src/infiniop/ops/erf/metax/erf_metax.h @@ -0,0 +1,8 @@ +#ifndef __ERF_METAX_API_H__ +#define __ERF_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(erf, metax) + +#endif // __ERF_METAX_API_H__ diff --git 
a/src/infiniop/ops/erf/metax/erf_metax.maca b/src/infiniop/ops/erf/metax/erf_metax.maca new file mode 100644 index 000000000..fcf956af2 --- /dev/null +++ b/src/infiniop/ops/erf/metax/erf_metax.maca @@ -0,0 +1,60 @@ +#include "erf_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::erf::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ErfOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erf::metax diff --git 
a/src/infiniop/ops/erf/moore/erf_moore.h b/src/infiniop/ops/erf/moore/erf_moore.h new file mode 100644 index 000000000..620055688 --- /dev/null +++ b/src/infiniop/ops/erf/moore/erf_moore.h @@ -0,0 +1,8 @@ +#ifndef __ERF_MOORE_API_H__ +#define __ERF_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(erf, moore) + +#endif // __ERF_MOORE_API_H__ diff --git a/src/infiniop/ops/erf/moore/erf_moore.mu b/src/infiniop/ops/erf/moore/erf_moore.mu new file mode 100644 index 000000000..856764c70 --- /dev/null +++ b/src/infiniop/ops/erf/moore/erf_moore.mu @@ -0,0 +1,60 @@ +#include "erf_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "../cuda/kernel.cuh" + +namespace op::erf::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfOp, half>(_info, workspace, output, inputs, stream); + case 
INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ErfOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erf::moore diff --git a/src/infiniop/ops/erf/nvidia/erf_nvidia.cu b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu new file mode 100644 index 000000000..03e14bb57 --- /dev/null +++ b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu @@ -0,0 +1,58 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "erf_nvidia.cuh" + +namespace op::erf::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + 
return _device_info->calculate<256, cuda::ErfOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::erf::nvidia diff --git a/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh b/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh new file mode 100644 index 000000000..d20658027 --- /dev/null +++ b/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ERF_NVIDIA_H__ +#define __ERF_NVIDIA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(erf, nvidia) + +#endif // __ERF_NVIDIA_H__ diff --git a/src/infiniop/ops/erf/operator.cc b/src/infiniop/ops/erf/operator.cc new file mode 100644 index 000000000..7ed264e33 --- /dev/null +++ b/src/infiniop/ops/erf/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/erf.h" + +#ifdef ENABLE_CPU_API +#include "cpu/erf_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/erf_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/erf_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/erf_moore.h" +#endif + +__INFINI_C __export infiniStatus_t infiniopCreateErfDescriptor( + infiniopHandle_t handle, + infiniopErfDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::erf::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + 
CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__INFINI_C __export infiniStatus_t infiniopErf( + infiniopErfDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C __export infiniStatus_t +infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + 
+#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/erfc/cpu/erfc_cpu.cc b/src/infiniop/ops/erfc/cpu/erfc_cpu.cc new file mode 100644 index 000000000..35b82c678 --- /dev/null +++ b/src/infiniop/ops/erfc/cpu/erfc_cpu.cc @@ -0,0 +1,52 @@ +#include "erfc_cpu.h" + +namespace op::erfc::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return 
INFINI_STATUS_SUCCESS; +} + +} // namespace op::erfc::cpu diff --git a/src/infiniop/ops/erfc/cpu/erfc_cpu.h b/src/infiniop/ops/erfc/cpu/erfc_cpu.h new file mode 100644 index 000000000..dd6d69496 --- /dev/null +++ b/src/infiniop/ops/erfc/cpu/erfc_cpu.h @@ -0,0 +1,20 @@ +#ifndef __ERFC_CPU_H__ +#define __ERFC_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(erfc, cpu) + +namespace op::erfc::cpu { +typedef struct ErfcOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + return std::erfc(x); + } +} ErfcOp; +} // namespace op::erfc::cpu + +#endif // __ERFC_CPU_H__ diff --git a/src/infiniop/ops/erfc/cuda/kernel.cuh b/src/infiniop/ops/erfc/cuda/kernel.cuh new file mode 100644 index 000000000..5a448919d --- /dev/null +++ b/src/infiniop/ops/erfc/cuda/kernel.cuh @@ -0,0 +1,33 @@ +#pragma once +#include +#include + +namespace op::cuda { + +struct ErfcOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(T x) const { + if constexpr (std::is_same_v) { + return erfcf(x); + } else if constexpr (std::is_same_v) { + return ::erfc(x); + } else { + // For F16/BF16: promote to float, compute, then cast back + float xf; + if constexpr (std::is_same_v) { + xf = __half2float(x); + return __float2half_rn(erfcf(xf)); + } else if constexpr (std::is_same_v) { + xf = __bfloat162float(x); + return __float2bfloat16_rn(erfcf(xf)); + } else { + xf = static_cast(x); + return static_cast(erfcf(xf)); + } + } + } +}; + +} // namespace op::cuda diff --git a/src/infiniop/ops/erfc/erfc.h b/src/infiniop/ops/erfc/erfc.h new file mode 100644 index 000000000..9ee12fd43 --- /dev/null +++ b/src/infiniop/ops/erfc/erfc.h @@ -0,0 +1,8 @@ +#ifndef __ERFC_H__ +#define __ERFC_H__ + +#include "../../elementwise/elementwise.h" + +#define DESCRIPTOR(NAMESPACE) ELEMENTWISE_DESCRIPTOR(erfc, NAMESPACE) + +#endif // __ERFC_H__ diff --git 
a/src/infiniop/ops/erfc/metax/erfc_metax.h b/src/infiniop/ops/erfc/metax/erfc_metax.h new file mode 100644 index 000000000..438f00095 --- /dev/null +++ b/src/infiniop/ops/erfc/metax/erfc_metax.h @@ -0,0 +1,8 @@ +#ifndef __ERFC_METAX_API_H__ +#define __ERFC_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(erfc, metax) + +#endif // __ERFC_METAX_API_H__ diff --git a/src/infiniop/ops/erfc/metax/erfc_metax.maca b/src/infiniop/ops/erfc/metax/erfc_metax.maca new file mode 100644 index 000000000..925e52ccf --- /dev/null +++ b/src/infiniop/ops/erfc/metax/erfc_metax.maca @@ -0,0 +1,60 @@ +#include "erfc_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::erfc::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfcOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfcOp, cuda_bfloat16>(_info, workspace, output, 
inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ErfcOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfcOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erfc::metax diff --git a/src/infiniop/ops/erfc/moore/erfc_moore.h b/src/infiniop/ops/erfc/moore/erfc_moore.h new file mode 100644 index 000000000..d032e4305 --- /dev/null +++ b/src/infiniop/ops/erfc/moore/erfc_moore.h @@ -0,0 +1,8 @@ +#ifndef __ERFC_MOORE_API_H__ +#define __ERFC_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(erfc, moore) + +#endif // __ERFC_MOORE_API_H__ diff --git a/src/infiniop/ops/erfc/moore/erfc_moore.mu b/src/infiniop/ops/erfc/moore/erfc_moore.mu new file mode 100644 index 000000000..d58243e50 --- /dev/null +++ b/src/infiniop/ops/erfc/moore/erfc_moore.mu @@ -0,0 +1,60 @@ +#include "erfc_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "../cuda/kernel.cuh" + +namespace op::erfc::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + 
std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfcOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfcOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ErfcOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfcOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erfc::moore diff --git a/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cu b/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cu new file mode 100644 index 000000000..483f11a18 --- /dev/null +++ b/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cu @@ -0,0 +1,58 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "erfc_nvidia.cuh" + +namespace op::erfc::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector 
inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfcOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfcOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ErfcOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfcOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::erfc::nvidia diff --git a/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cuh b/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cuh new file mode 100644 index 000000000..4d5321c9d --- /dev/null +++ b/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ERFC_NVIDIA_H__ +#define __ERFC_NVIDIA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(erfc, nvidia) + +#endif // __ERFC_NVIDIA_H__ diff --git a/src/infiniop/ops/erfc/operator.cc b/src/infiniop/ops/erfc/operator.cc new file mode 100644 index 000000000..ebd692349 --- /dev/null +++ b/src/infiniop/ops/erfc/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/erfc.h" + +#ifdef ENABLE_CPU_API +#include "cpu/erfc_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/erfc_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/erfc_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/erfc_moore.h" +#endif + +__INFINI_C __export infiniStatus_t infiniopCreateErfcDescriptor( + infiniopHandle_t handle, + infiniopErfcDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t 
x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::erfc::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C __export infiniStatus_t infiniopGetErfcWorkspaceSize(infiniopErfcDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__INFINI_C __export infiniStatus_t infiniopErfc( + infiniopErfcDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + 
CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C __export infiniStatus_t +infiniopDestroyErfcDescriptor(infiniopErfcDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/erfinv/cpu/erfinv_cpu.cc b/src/infiniop/ops/erfinv/cpu/erfinv_cpu.cc new file mode 100644 index 000000000..16c5c8cba --- /dev/null +++ b/src/infiniop/ops/erfinv/cpu/erfinv_cpu.cc @@ -0,0 +1,52 @@ +#include "erfinv_cpu.h" + +namespace op::erfinv::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + 
void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erfinv::cpu diff --git a/src/infiniop/ops/erfinv/cpu/erfinv_cpu.h b/src/infiniop/ops/erfinv/cpu/erfinv_cpu.h new file mode 100644 index 000000000..e7b1b723d --- /dev/null +++ b/src/infiniop/ops/erfinv/cpu/erfinv_cpu.h @@ -0,0 +1,56 @@ +#ifndef __ERFINV_CPU_H__ +#define __ERFINV_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include +#include + +ELEMENTWISE_DESCRIPTOR(erfinv, cpu) + +namespace op::erfinv::cpu { + +// Inverse error function implementation using Newton's method +template +T erfinv_impl(T x) { + // Domain: x in (-1, 1) + if (x == 1.0) { + return std::numeric_limits::infinity(); + } + if (x == -1.0) { + return -std::numeric_limits::infinity(); + } + if (x > 1.0 || x < -1.0) { + return std::numeric_limits::quiet_NaN(); + } + if (x == 0.0) { + return 0.0; + } + + // Use Newton's method to solve erf(y) = x + T y = x; // Initial guess + const int max_iter = 10; + const T tol = static_cast(1e-10); + + for (int i = 0; i < max_iter; ++i) { + T erf_y = std::erf(y); + T derf_dy = T(2.0) / T(std::sqrt(3.14159265358979323846) * std::exp(-y * y)); + T error = erf_y - x; + if (std::abs(error) < tol) { + break; + } + y = y - error / derf_dy; + } + return y; +} + +typedef struct ErfinvOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + return erfinv_impl(x); + } +} ErfinvOp; +} // namespace op::erfinv::cpu + +#endif // __ERFINV_CPU_H__ diff --git a/src/infiniop/ops/erfinv/cuda/kernel.cuh 
b/src/infiniop/ops/erfinv/cuda/kernel.cuh new file mode 100644 index 000000000..18025331a --- /dev/null +++ b/src/infiniop/ops/erfinv/cuda/kernel.cuh @@ -0,0 +1,128 @@ +#pragma once +#include +#include +#include + +namespace op::cuda { + +constexpr float PI_F = 3.14159265358979323846f; +constexpr double PI = 3.14159265358979323846; + +// Inverse error function. +__device__ __forceinline__ float erfinv_impl(float x) { + if (x == 1.0f) { + return std::numeric_limits::infinity(); + } + if (x == -1.0f) { + return -std::numeric_limits::infinity(); + } + if (x > 1.0f || x < -1.0f) { + return std::numeric_limits::quiet_NaN(); + } + if (x == 0.0f) { + return 0.0f; + } + + // Winitzki approximation + const float a = 0.147f; + const float ln = log1pf(-x * x); + const float t = 2.0f / (PI_F * a) + ln * 0.5f; + + float inside = t * t - ln / a; + inside = inside > 0.0f ? inside : 0.0f; + + float y0 = copysignf(sqrtf(sqrtf(inside) - t), x); + + float y = y0; + const float sqrt_pi_f = sqrtf(PI_F); + +#pragma unroll + for (int i = 0; i < 4; ++i) { + const float erf_y = erff(y); + const float derf_dy = 2.0f / sqrt_pi_f * expf(-y * y); + y = y - (erf_y - x) / derf_dy; + } + + // Slow path near |x| ~ 1 + const float ax = fabsf(x); + if (1.0f - ax < 1e-4f) { + const double xd = static_cast(x); + double yd = static_cast(y); + const double sqrt_pi = sqrt(PI); + +#pragma unroll + for (int i = 0; i < 4; ++i) { + const double erf_y = erf(yd); + const double derf_dy = 2.0 / sqrt_pi * exp(-yd * yd); + yd = yd - (erf_y - xd) / derf_dy; + } + y = static_cast(yd); + } + + return y; +} + +struct ErfinvOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(T x) const { + if constexpr (std::is_same_v) { + return erfinv_impl(x); + + } else if constexpr (std::is_same_v) { + if (x == 1.0) { + return std::numeric_limits::infinity(); + } + if (x == -1.0) { + return -std::numeric_limits::infinity(); + } + if (x > 1.0 || x < -1.0) { + return 
std::numeric_limits::quiet_NaN(); + } + if (x == 0.0) { + return 0.0; + } + + const double a = 0.147; + const double ln = log1p(-x * x); + const double t = 2.0 / (PI * a) + ln * 0.5; + + double inside = t * t - ln / a; + inside = inside > 0.0 ? inside : 0.0; + + double y = copysign(sqrt(sqrt(inside) - t), x); + + const int max_iter = 30; + const double tol = 1e-14; + const double sqrt_pi = sqrt(PI); + + for (int i = 0; i < max_iter; ++i) { + const double erf_y = erf(y); + const double error = erf_y - x; + if (fabs(error) < tol) { + break; + } + const double derf_dy = 2.0 / sqrt_pi * exp(-y * y); + y = y - error / derf_dy; + } + return y; + + } else { + // F16 / BF16 / other types + float xf; + if constexpr (std::is_same_v) { + xf = __half2float(x); + return __float2half_rn(erfinv_impl(xf)); + } else if constexpr (std::is_same_v) { + xf = __bfloat162float(x); + return __float2bfloat16_rn(erfinv_impl(xf)); + } else { + xf = static_cast(x); + return static_cast(erfinv_impl(xf)); + } + } + } +}; + +} // namespace op::cuda diff --git a/src/infiniop/ops/erfinv/erfinv.h b/src/infiniop/ops/erfinv/erfinv.h new file mode 100644 index 000000000..f3ed9350f --- /dev/null +++ b/src/infiniop/ops/erfinv/erfinv.h @@ -0,0 +1,8 @@ +#ifndef __ERFINV_H__ +#define __ERFINV_H__ + +#include "../../elementwise/elementwise.h" + +#define DESCRIPTOR(NAMESPACE) ELEMENTWISE_DESCRIPTOR(erfinv, NAMESPACE) + +#endif // __ERFINV_H__ diff --git a/src/infiniop/ops/erfinv/metax/erfinv_metax.h b/src/infiniop/ops/erfinv/metax/erfinv_metax.h new file mode 100644 index 000000000..05058bfc6 --- /dev/null +++ b/src/infiniop/ops/erfinv/metax/erfinv_metax.h @@ -0,0 +1,8 @@ +#ifndef __ERFINV_METAX_API_H__ +#define __ERFINV_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(erfinv, metax) + +#endif // __ERFINV_METAX_API_H__ diff --git a/src/infiniop/ops/erfinv/metax/erfinv_metax.maca b/src/infiniop/ops/erfinv/metax/erfinv_metax.maca new file mode 100644 
index 000000000..1e9144074 --- /dev/null +++ b/src/infiniop/ops/erfinv/metax/erfinv_metax.maca @@ -0,0 +1,60 @@ +#include "erfinv_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::erfinv::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfinvOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfinvOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ErfinvOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfinvOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erfinv::metax diff --git a/src/infiniop/ops/erfinv/moore/erfinv_moore.h b/src/infiniop/ops/erfinv/moore/erfinv_moore.h new file mode 
100644 index 000000000..9eed18024 --- /dev/null +++ b/src/infiniop/ops/erfinv/moore/erfinv_moore.h @@ -0,0 +1,8 @@ +#ifndef __ERFINV_MOORE_API_H__ +#define __ERFINV_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(erfinv, moore) + +#endif // __ERFINV_MOORE_API_H__ diff --git a/src/infiniop/ops/erfinv/moore/erfinv_moore.mu b/src/infiniop/ops/erfinv/moore/erfinv_moore.mu new file mode 100644 index 000000000..26febe69d --- /dev/null +++ b/src/infiniop/ops/erfinv/moore/erfinv_moore.mu @@ -0,0 +1,60 @@ +#include "erfinv_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "../cuda/kernel.cuh" + +namespace op::erfinv::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfinvOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfinvOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, 
cuda::ErfinvOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfinvOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::erfinv::moore diff --git a/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cu b/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cu new file mode 100644 index 000000000..35f5d3fe2 --- /dev/null +++ b/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cu @@ -0,0 +1,58 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "erfinv_nvidia.cuh" + +namespace op::erfinv::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfinvOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ErfinvOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return 
_device_info->calculate<256, cuda::ErfinvOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ErfinvOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::erfinv::nvidia diff --git a/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cuh b/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cuh new file mode 100644 index 000000000..af80be12f --- /dev/null +++ b/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ERFINV_NVIDIA_H__ +#define __ERFINV_NVIDIA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(erfinv, nvidia) + +#endif // __ERFINV_NVIDIA_H__ diff --git a/src/infiniop/ops/erfinv/operator.cc b/src/infiniop/ops/erfinv/operator.cc new file mode 100644 index 000000000..c0453562b --- /dev/null +++ b/src/infiniop/ops/erfinv/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/erfinv.h" + +#ifdef ENABLE_CPU_API +#include "cpu/erfinv_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/erfinv_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/erfinv_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/erfinv_moore.h" +#endif + +__INFINI_C __export infiniStatus_t infiniopCreateErfinvDescriptor( + infiniopHandle_t handle, + infiniopErfinvDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::erfinv::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + 
CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C __export infiniStatus_t infiniopGetErfinvWorkspaceSize(infiniopErfinvDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__INFINI_C __export infiniStatus_t infiniopErfinv( + infiniopErfinvDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C __export infiniStatus_t +infiniopDestroyErfinvDescriptor(infiniopErfinvDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete 
reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.cc b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.cc new file mode 100644 index 000000000..6c2fe60cd --- /dev/null +++ b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.cc @@ -0,0 +1,201 @@ +#include "matrix_power_cpu.h" +#include "../../../tensor.h" +#include +#include + +namespace op::matrix_power::cpu { + +utils::Result MatrixPowerInfo::create( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + int n) { + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 2 || x_shape[0] != x_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (y_shape != x_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (n < 0) { + return INFINI_STATUS_BAD_PARAM; + } + + MatrixPowerInfo info; + info.matrix_size = x_shape[0]; + info.n = static_cast(n); + info.input_size = x_desc->numel(); + info.output_size = y_desc->numel(); + + return utils::Result(std::move(info)); +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + 
CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES); + CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + + auto info_result = MatrixPowerInfo::create(x_desc, y_desc, n); + CHECK_RESULT(info_result); + + *desc_ptr = new Descriptor(dtype, info_result.take(), handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +void matrix_power_impl( + const MatrixPowerInfo &info, + T *y, + const T *x, + void *workspace) { + + size_t n = info.matrix_size; + int power = static_cast(info.n); + + // Use workspace for temporary matrices + T *temp1 = reinterpret_cast(workspace); + T *temp2 = temp1 + n * n; + + // Initialize result as identity matrix + std::memset(y, 0, n * n * sizeof(T)); + for (size_t i = 0; i < n; ++i) { + y[i * n + i] = utils::cast(1.0); + } + + // Copy input to temp1 + std::memcpy(temp1, x, n * n * sizeof(T)); + + // Binary exponentiation + while (power > 0) { + if (power & 1) { + // Multiply result by temp1 + std::memset(temp2, 0, n * n * sizeof(T)); + for (size_t i = 0; i < n; ++i) { + for (size_t k = 0; k < n; ++k) { + T val = y[i * n + k]; + for (size_t j = 0; j < n; ++j) { + temp2[i * n + j] += val * temp1[k * n + j]; + } + } + } + std::memcpy(y, temp2, n * n * sizeof(T)); + } + // Square temp1 + std::memset(temp2, 0, n * n * sizeof(T)); + for (size_t i = 0; i < n; ++i) { + for (size_t k = 0; k < n; ++k) { + T val = temp1[i * n + k]; + for (size_t j = 0; j < n; ++j) { + temp2[i * n + j] += val * temp1[k * n + j]; + } + } + } + std::memcpy(temp1, temp2, n * n * sizeof(T)); + power >>= 1; + } +} + +template +void write_identity_impl(size_t n, T *y) { + std::fill(y, y + n * n, utils::cast(0.0)); + for (size_t i = 0; i < n; ++i) { + y[i * n + i] = utils::cast(1.0); + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + if 
(_info.matrix_size == 0) { + return INFINI_STATUS_SUCCESS; + } + if (_info.n == 0) { + const size_t n = _info.matrix_size; + switch (_dtype) { + case INFINI_DTYPE_F16: + write_identity_impl(n, reinterpret_cast(y)); + return INFINI_STATUS_SUCCESS; + case INFINI_DTYPE_BF16: + write_identity_impl(n, reinterpret_cast(y)); + return INFINI_STATUS_SUCCESS; + case INFINI_DTYPE_F32: + write_identity_impl(n, reinterpret_cast(y)); + return INFINI_STATUS_SUCCESS; + case INFINI_DTYPE_F64: + write_identity_impl(n, reinterpret_cast(y)); + return INFINI_STATUS_SUCCESS; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + } + + if (workspace_size < this->workspaceSize()) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + if (this->workspaceSize() > 0 && workspace == nullptr) { + return INFINI_STATUS_BAD_PARAM; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: { + // Convert to float for computation + std::vector x_f(_info.input_size); + std::vector y_f(_info.output_size); + float *workspace_f = reinterpret_cast(workspace); + for (size_t i = 0; i < _info.input_size; ++i) { + x_f[i] = utils::cast(reinterpret_cast(x)[i]); + } + MatrixPowerInfo info_f = _info; + matrix_power_impl(info_f, y_f.data(), x_f.data(), workspace_f); + for (size_t i = 0; i < _info.output_size; ++i) { + reinterpret_cast(y)[i] = utils::cast(y_f[i]); + } + break; + } + case INFINI_DTYPE_BF16: { + std::vector x_f(_info.input_size); + std::vector y_f(_info.output_size); + float *workspace_f = reinterpret_cast(workspace); + for (size_t i = 0; i < _info.input_size; ++i) { + x_f[i] = utils::cast(reinterpret_cast(x)[i]); + } + MatrixPowerInfo info_f = _info; + matrix_power_impl(info_f, y_f.data(), x_f.data(), workspace_f); + for (size_t i = 0; i < _info.output_size; ++i) { + reinterpret_cast(y)[i] = utils::cast(y_f[i]); + } + break; + } + case INFINI_DTYPE_F32: + matrix_power_impl(_info, reinterpret_cast(y), reinterpret_cast(x), workspace); + break; + case INFINI_DTYPE_F64: + matrix_power_impl(_info, 
reinterpret_cast(y), reinterpret_cast(x), workspace); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::matrix_power::cpu diff --git a/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.h b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.h new file mode 100644 index 000000000..06a14c341 --- /dev/null +++ b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.h @@ -0,0 +1,69 @@ +#ifndef __MATRIX_POWER_CPU_H__ +#define __MATRIX_POWER_CPU_H__ + +#include "../../../devices/cpu/common_cpu.h" +#include "../../../operator.h" +#include + +namespace op::matrix_power::cpu { + +struct MatrixPowerInfo { + size_t matrix_size; // N x N matrix + size_t n; // Power + size_t input_size; + size_t output_size; + + static utils::Result create( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + int n); +}; + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + MatrixPowerInfo _info; + + Descriptor(infiniDtype_t dtype, MatrixPowerInfo info, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n); + + size_t workspaceSize() const { + if (_info.n == 0 || _info.matrix_size == 0) { + return 0; + } + const size_t elems = 2 * _info.matrix_size * _info.matrix_size; + switch (_dtype) { + case INFINI_DTYPE_F16: + case INFINI_DTYPE_BF16: + case INFINI_DTYPE_F32: + return elems * sizeof(float); + case INFINI_DTYPE_F64: + return elems * sizeof(double); + default: + return 0; + } + } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::matrix_power::cpu + +#endif // __MATRIX_POWER_CPU_H__ diff --git 
a/src/infiniop/ops/matrix_power/metax/matrix_power_metax.h b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.h new file mode 100644 index 000000000..43e561f66 --- /dev/null +++ b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.h @@ -0,0 +1,47 @@ +#ifndef __MATRIX_POWER_METAX_H__ +#define __MATRIX_POWER_METAX_H__ + +#include "../../../operator.h" + +namespace op::matrix_power::metax { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t matrix_size; + size_t n; + size_t input_size; + size_t output_size; + + Descriptor(infiniDtype_t dtype, size_t matrix_size, size_t n, + size_t input_size, size_t output_size, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + matrix_size(matrix_size), + n(n), + input_size(input_size), + output_size(output_size) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::matrix_power::metax + +#endif // __MATRIX_POWER_METAX_H__ diff --git a/src/infiniop/ops/matrix_power/metax/matrix_power_metax.maca b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.maca new file mode 100644 index 000000000..221757141 --- /dev/null +++ b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.maca @@ -0,0 +1,107 @@ +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../tensor.h" +#include "matrix_power_metax.h" +#include +#include + +namespace op::matrix_power::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + 
infiniopTensorDescriptor_t x_desc, + int n) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F32); + CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 2 || x_shape[0] != x_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (y_shape != x_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + CHECK_OR_RETURN(n >= 0, INFINI_STATUS_BAD_PARAM); + + CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES); + CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + + *desc_ptr = new Descriptor(dtype, x_shape[0], static_cast(n), + x_desc->numel(), y_desc->numel(), + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + if (workspace_size < this->workspaceSize()) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + auto hc_stream = reinterpret_cast(stream); + CHECK_OR_RETURN(_dtype == INFINI_DTYPE_F32, INFINI_STATUS_BAD_TENSOR_DTYPE); + size_t input_bytes = input_size * sizeof(float); + + // Use CPU fallback for now + std::vector h_matrix(input_size); + CHECK_METAX(hcMemcpyAsync(h_matrix.data(), x, input_bytes, hcMemcpyDeviceToHost, hc_stream)); + CHECK_METAX(hcStreamSynchronize(hc_stream)); + + std::vector result(output_size, 0.0f); + std::vector temp1(input_size); + std::vector temp2(input_size); + std::memcpy(temp1.data(), h_matrix.data(), input_bytes); + + for (size_t i = 0; i < matrix_size; ++i) { + result[i * matrix_size + i] = 1.0f; + } + + int power = static_cast(n); + while (power > 0) { + if (power & 1) { + std::fill(temp2.begin(), temp2.end(), 0.0f); + for (size_t i = 0; i < matrix_size; ++i) { + for (size_t k = 0; k < matrix_size; ++k) { + float val = 
result[i * matrix_size + k]; + for (size_t j = 0; j < matrix_size; ++j) { + temp2[i * matrix_size + j] += val * temp1[k * matrix_size + j]; + } + } + } + std::memcpy(result.data(), temp2.data(), output_size * sizeof(float)); + } + std::fill(temp2.begin(), temp2.end(), 0.0f); + for (size_t i = 0; i < matrix_size; ++i) { + for (size_t k = 0; k < matrix_size; ++k) { + float val = temp1[i * matrix_size + k]; + for (size_t j = 0; j < matrix_size; ++j) { + temp2[i * matrix_size + j] += val * temp1[k * matrix_size + j]; + } + } + } + std::memcpy(temp1.data(), temp2.data(), input_bytes); + power >>= 1; + } + + CHECK_METAX(hcMemcpyAsync(y, result.data(), input_bytes, hcMemcpyHostToDevice, hc_stream)); + CHECK_METAX(hcStreamSynchronize(hc_stream)); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::matrix_power::metax diff --git a/src/infiniop/ops/matrix_power/moore/matrix_power_moore.h b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.h new file mode 100644 index 000000000..42dd5c180 --- /dev/null +++ b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.h @@ -0,0 +1,47 @@ +#ifndef __MATRIX_POWER_MOORE_H__ +#define __MATRIX_POWER_MOORE_H__ + +#include "../../../operator.h" + +namespace op::matrix_power::moore { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t matrix_size; + size_t n; + size_t input_size; + size_t output_size; + + Descriptor(infiniDtype_t dtype, size_t matrix_size, size_t n, + size_t input_size, size_t output_size, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + matrix_size(matrix_size), + n(n), + input_size(input_size), + output_size(output_size) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, 
+ size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::matrix_power::moore + +#endif // __MATRIX_POWER_MOORE_H__ diff --git a/src/infiniop/ops/matrix_power/moore/matrix_power_moore.mu b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.mu new file mode 100644 index 000000000..938ad5c0a --- /dev/null +++ b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.mu @@ -0,0 +1,106 @@ +#include "../../../devices/moore/moore_common.h" +#include "../../../devices/moore/moore_kernel_common.h" +#include "../../../tensor.h" +#include "matrix_power_moore.h" +#include +#include + +namespace op::matrix_power::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F32); + CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 2 || x_shape[0] != x_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (y_shape != x_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + CHECK_OR_RETURN(n >= 0, INFINI_STATUS_BAD_PARAM); + + CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES); + CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + + *desc_ptr = new Descriptor(dtype, x_shape[0], static_cast(n), + x_desc->numel(), y_desc->numel(), + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + if (workspace_size < this->workspaceSize()) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + auto musa_stream = 
reinterpret_cast(stream); + CHECK_OR_RETURN(_dtype == INFINI_DTYPE_F32, INFINI_STATUS_BAD_TENSOR_DTYPE); + size_t input_bytes = input_size * sizeof(float); + + std::vector h_matrix(input_size); + CHECK_MOORE(musaMemcpyAsync(h_matrix.data(), x, input_bytes, musaMemcpyDeviceToHost, musa_stream)); + CHECK_MOORE(musaStreamSynchronize(musa_stream)); + + std::vector result(output_size, 0.0f); + std::vector temp1(input_size); + std::vector temp2(input_size); + std::memcpy(temp1.data(), h_matrix.data(), input_bytes); + + for (size_t i = 0; i < matrix_size; ++i) { + result[i * matrix_size + i] = 1.0f; + } + + int power = static_cast(n); + while (power > 0) { + if (power & 1) { + std::fill(temp2.begin(), temp2.end(), 0.0f); + for (size_t i = 0; i < matrix_size; ++i) { + for (size_t k = 0; k < matrix_size; ++k) { + float val = result[i * matrix_size + k]; + for (size_t j = 0; j < matrix_size; ++j) { + temp2[i * matrix_size + j] += val * temp1[k * matrix_size + j]; + } + } + } + std::memcpy(result.data(), temp2.data(), output_size * sizeof(float)); + } + std::fill(temp2.begin(), temp2.end(), 0.0f); + for (size_t i = 0; i < matrix_size; ++i) { + for (size_t k = 0; k < matrix_size; ++k) { + float val = temp1[i * matrix_size + k]; + for (size_t j = 0; j < matrix_size; ++j) { + temp2[i * matrix_size + j] += val * temp1[k * matrix_size + j]; + } + } + } + std::memcpy(temp1.data(), temp2.data(), input_bytes); + power >>= 1; + } + + CHECK_MOORE(musaMemcpyAsync(y, result.data(), input_bytes, musaMemcpyHostToDevice, musa_stream)); + CHECK_MOORE(musaStreamSynchronize(musa_stream)); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::matrix_power::moore diff --git a/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cu b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cu new file mode 100644 index 000000000..d728238c1 --- /dev/null +++ b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cu @@ -0,0 +1,546 @@ +#include 
"../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../../../tensor.h" +#include "matrix_power_nvidia.cuh" +#include +#include + +namespace op::matrix_power::nvidia { + +namespace { + +template +__forceinline__ __device__ T identityZero(); + +template +__forceinline__ __device__ T identityOne(); + +template <> +__forceinline__ __device__ __half identityZero<__half>() { + return __float2half(0.0f); +} + +template <> +__forceinline__ __device__ __half identityOne<__half>() { + return __float2half(1.0f); +} + +template <> +__forceinline__ __device__ cuda_bfloat16 identityZero() { + return __float2bfloat16(0.0f); +} + +template <> +__forceinline__ __device__ cuda_bfloat16 identityOne() { + return __float2bfloat16(1.0f); +} + +template <> +__forceinline__ __device__ float identityZero() { + return 0.0f; +} + +template <> +__forceinline__ __device__ float identityOne() { + return 1.0f; +} + +template <> +__forceinline__ __device__ double identityZero() { + return 0.0; +} + +template <> +__forceinline__ __device__ double identityOne() { + return 1.0; +} + +template +INFINIOP_CUDA_KERNEL packMatrix2dStridedToContiguous( + const T *src, + T *dst, + size_t matrix_size, + ptrdiff_t src_stride_0, + ptrdiff_t src_stride_1) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t matrix_numel = matrix_size * matrix_size; + if (idx < matrix_numel) { + size_t row = idx / matrix_size; + size_t col = idx - row * matrix_size; + dst[idx] = src[row * src_stride_0 + col * src_stride_1]; + } +} + +template +INFINIOP_CUDA_KERNEL scatterMatrix2dContiguousToStrided( + const T *src, + T *dst, + size_t matrix_size, + ptrdiff_t dst_stride_0, + ptrdiff_t dst_stride_1) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t matrix_numel = matrix_size * matrix_size; + if (idx < matrix_numel) { + size_t row = idx / matrix_size; + size_t col = idx - row * matrix_size; + dst[row * dst_stride_0 + col * dst_stride_1] = 
src[idx]; + } +} + +template +INFINIOP_CUDA_KERNEL setIdentity2dStrided( + T *out, + size_t matrix_size, + ptrdiff_t out_stride_0, + ptrdiff_t out_stride_1) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t matrix_numel = matrix_size * matrix_size; + if (idx < matrix_numel) { + size_t row = idx / matrix_size; + size_t col = idx - row * matrix_size; + out[row * out_stride_0 + col * out_stride_1] = (row == col) ? identityOne() : identityZero(); + } +} + +INFINIOP_CUDA_KERNEL setDiagonalFp16(__half *out, size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + out[idx * n + idx] = __float2half(1.0f); + } +} + +INFINIOP_CUDA_KERNEL setDiagonalBf16(cuda_bfloat16 *out, size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + out[idx * n + idx] = __float2bfloat16(1.0f); + } +} + +INFINIOP_CUDA_KERNEL setDiagonalFp32(float *out, size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + out[idx * n + idx] = 1.0f; + } +} + +INFINIOP_CUDA_KERNEL setDiagonalFp64(double *out, size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + out[idx * n + idx] = 1.0; + } +} + +infiniStatus_t initializeIdentity( + void *y, + infiniDtype_t dtype, + size_t matrix_size, + bool y_contiguous, + ptrdiff_t y_stride_0, + ptrdiff_t y_stride_1, + cudaStream_t stream) { + + if (matrix_size == 0) { + return INFINI_STATUS_SUCCESS; + } + + constexpr int threads = 256; + size_t diag_blocks = CEIL_DIV(matrix_size, static_cast(threads)); + size_t matrix_numel = matrix_size * matrix_size; + size_t matrix_blocks = CEIL_DIV(matrix_numel, static_cast(threads)); + + if (y_contiguous) { + CHECK_CUDA(cudaMemsetAsync(y, 0, matrix_numel * infiniSizeOf(dtype), stream)); + } + + switch (dtype) { + case INFINI_DTYPE_F16: + if (y_contiguous) { + setDiagonalFp16<<(diag_blocks), threads, 0, stream>>>( + reinterpret_cast<__half *>(y), matrix_size); + } else { + 
setIdentity2dStrided<<(matrix_blocks), threads, 0, stream>>>( + reinterpret_cast<__half *>(y), matrix_size, y_stride_0, y_stride_1); + } + break; + case INFINI_DTYPE_BF16: + if (y_contiguous) { + setDiagonalBf16<<(diag_blocks), threads, 0, stream>>>( + reinterpret_cast(y), matrix_size); + } else { + setIdentity2dStrided<<(matrix_blocks), threads, 0, stream>>>( + reinterpret_cast(y), matrix_size, y_stride_0, y_stride_1); + } + break; + case INFINI_DTYPE_F32: + if (y_contiguous) { + setDiagonalFp32<<(diag_blocks), threads, 0, stream>>>( + reinterpret_cast(y), matrix_size); + } else { + setIdentity2dStrided<<(matrix_blocks), threads, 0, stream>>>( + reinterpret_cast(y), matrix_size, y_stride_0, y_stride_1); + } + break; + case INFINI_DTYPE_F64: + if (y_contiguous) { + setDiagonalFp64<<(diag_blocks), threads, 0, stream>>>( + reinterpret_cast(y), matrix_size); + } else { + setIdentity2dStrided<<(matrix_blocks), threads, 0, stream>>>( + reinterpret_cast(y), matrix_size, y_stride_0, y_stride_1); + } + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + CHECK_CUDA(cudaGetLastError()); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t packInputToContiguous( + void *dst, + const void *src, + infiniDtype_t dtype, + size_t matrix_size, + ptrdiff_t src_stride_0, + ptrdiff_t src_stride_1, + cudaStream_t stream) { + constexpr int threads = 256; + size_t matrix_numel = matrix_size * matrix_size; + size_t blocks = CEIL_DIV(matrix_numel, static_cast(threads)); + switch (dtype) { + case INFINI_DTYPE_F16: + packMatrix2dStridedToContiguous<<(blocks), threads, 0, stream>>>( + reinterpret_cast(src), reinterpret_cast<__half *>(dst), + matrix_size, src_stride_0, src_stride_1); + break; + case INFINI_DTYPE_BF16: + packMatrix2dStridedToContiguous<<(blocks), threads, 0, stream>>>( + reinterpret_cast(src), reinterpret_cast(dst), + matrix_size, src_stride_0, src_stride_1); + break; + case INFINI_DTYPE_F32: + packMatrix2dStridedToContiguous<<(blocks), threads, 0, stream>>>( + 
reinterpret_cast(src), reinterpret_cast(dst), + matrix_size, src_stride_0, src_stride_1); + break; + case INFINI_DTYPE_F64: + packMatrix2dStridedToContiguous<<(blocks), threads, 0, stream>>>( + reinterpret_cast(src), reinterpret_cast(dst), + matrix_size, src_stride_0, src_stride_1); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + CHECK_CUDA(cudaGetLastError()); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t scatterContiguousToOutput( + void *dst, + const void *src, + infiniDtype_t dtype, + size_t matrix_size, + ptrdiff_t dst_stride_0, + ptrdiff_t dst_stride_1, + cudaStream_t stream) { + constexpr int threads = 256; + size_t matrix_numel = matrix_size * matrix_size; + size_t blocks = CEIL_DIV(matrix_numel, static_cast(threads)); + switch (dtype) { + case INFINI_DTYPE_F16: + scatterMatrix2dContiguousToStrided<<(blocks), threads, 0, stream>>>( + reinterpret_cast(src), reinterpret_cast<__half *>(dst), + matrix_size, dst_stride_0, dst_stride_1); + break; + case INFINI_DTYPE_BF16: + scatterMatrix2dContiguousToStrided<<(blocks), threads, 0, stream>>>( + reinterpret_cast(src), reinterpret_cast(dst), + matrix_size, dst_stride_0, dst_stride_1); + break; + case INFINI_DTYPE_F32: + scatterMatrix2dContiguousToStrided<<(blocks), threads, 0, stream>>>( + reinterpret_cast(src), reinterpret_cast(dst), + matrix_size, dst_stride_0, dst_stride_1); + break; + case INFINI_DTYPE_F64: + scatterMatrix2dContiguousToStrided<<(blocks), threads, 0, stream>>>( + reinterpret_cast(src), reinterpret_cast(dst), + matrix_size, dst_stride_0, dst_stride_1); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + CHECK_CUDA(cudaGetLastError()); + return INFINI_STATUS_SUCCESS; +} + +#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) +using GemmComputeType = cudaDataType; +#else +using GemmComputeType = cublasComputeType_t; +#endif + +struct GemmTypeConfig { + cudaDataType io_type; + GemmComputeType compute_type; +}; + +infiniStatus_t 
getGemmTypeConfig(infiniDtype_t dtype, GemmTypeConfig &cfg) { + switch (dtype) { + case INFINI_DTYPE_F16: + cfg.io_type = CUDA_R_16F; +#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) + cfg.compute_type = CUDA_R_32F; +#else + cfg.compute_type = CUBLAS_COMPUTE_32F; +#endif + return INFINI_STATUS_SUCCESS; + case INFINI_DTYPE_BF16: + cfg.io_type = CUDA_R_16BF; +#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) + cfg.compute_type = CUDA_R_32F; +#else + cfg.compute_type = CUBLAS_COMPUTE_32F; +#endif + return INFINI_STATUS_SUCCESS; + case INFINI_DTYPE_F32: + cfg.io_type = CUDA_R_32F; +#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) + cfg.compute_type = CUDA_R_32F; +#else + cfg.compute_type = CUBLAS_COMPUTE_32F; +#endif + return INFINI_STATUS_SUCCESS; + case INFINI_DTYPE_F64: + cfg.io_type = CUDA_R_64F; +#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API) + cfg.compute_type = CUDA_R_64F; +#else + cfg.compute_type = CUBLAS_COMPUTE_64F; +#endif + return INFINI_STATUS_SUCCESS; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +// Compute row-major C = A * B using cuBLAS column-major GEMM: +// C_col = B_col * A_col, where *_col views the same memory as column-major. 
+infiniStatus_t gemmRowMajorSquare( + cublasHandle_t handle, + const GemmTypeConfig &cfg, + infiniDtype_t dtype, + int n, + const void *a, + const void *b, + void *c) { + + if (dtype == INFINI_DTYPE_F64) { + const double alpha = 1.0; + const double beta = 0.0; + CHECK_CUBLAS(cublasGemmEx( + handle, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + n, + n, + &alpha, + b, + cfg.io_type, + n, + a, + cfg.io_type, + n, + &beta, + c, + cfg.io_type, + n, + cfg.compute_type, + CUBLAS_GEMM_DEFAULT)); + return INFINI_STATUS_SUCCESS; + } + + const float alpha = 1.0f; + const float beta = 0.0f; + CHECK_CUBLAS(cublasGemmEx( + handle, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + n, + n, + &alpha, + b, + cfg.io_type, + n, + a, + cfg.io_type, + n, + &beta, + c, + cfg.io_type, + n, + cfg.compute_type, + CUBLAS_GEMM_DEFAULT)); + return INFINI_STATUS_SUCCESS; +} + +} // namespace + +struct Descriptor::Opaque { + std::shared_ptr internal; + + Opaque(std::shared_ptr internal_) + : internal(internal_) {} +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n) { + + if (handle == nullptr || desc_ptr == nullptr || y_desc == nullptr || x_desc == nullptr) { + return INFINI_STATUS_BAD_PARAM; + } + if (n < 0) { + return INFINI_STATUS_BAD_PARAM; + } + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + if (y_desc->dtype() != dtype) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 2 || x_shape[0] != x_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (y_shape != x_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + if (x_desc->hasBroadcastDim() || y_desc->hasBroadcastDim()) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + if (x_shape[0] 
> static_cast(std::numeric_limits::max())) { + return INFINI_STATUS_BAD_PARAM; + } + + auto x_strides = x_desc->strides(); + auto y_strides = y_desc->strides(); + bool x_contiguous = x_desc->isContiguous(); + bool y_contiguous = y_desc->isContiguous(); + + size_t matrix_numel = x_desc->numel(); + size_t matrix_bytes = matrix_numel * infiniSizeOf(dtype); + size_t workspace_size = 0; + if (n != 0) { + workspace_size = matrix_bytes * (y_contiguous ? 2 : 3); + } + + auto handle_nvidia = reinterpret_cast(handle); + Descriptor *desc = new Descriptor(dtype, x_shape[0], static_cast(n), + matrix_numel, y_desc->numel(), workspace_size, + x_strides[0], x_strides[1], y_strides[0], y_strides[1], + x_contiguous, y_contiguous, + handle->device, handle->device_id); + desc->_opaque = new Opaque(handle_nvidia->internal()); + *desc_ptr = desc; + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + if (x == nullptr || y == nullptr) { + return INFINI_STATUS_BAD_PARAM; + } + if (workspace_size < this->workspaceSize()) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + if (this->workspaceSize() != 0 && workspace == nullptr) { + return INFINI_STATUS_BAD_PARAM; + } + + auto cuda_stream = reinterpret_cast(stream); + if (matrix_size == 0) { + return INFINI_STATUS_SUCCESS; + } + if (n == 0) { + CHECK_STATUS(initializeIdentity( + y, _dtype, matrix_size, y_contiguous, y_stride_0, y_stride_1, cuda_stream)); + return INFINI_STATUS_SUCCESS; + } + + size_t matrix_bytes = input_size * infiniSizeOf(_dtype); + char *workspace_ptr = reinterpret_cast(workspace); + void *base = workspace_ptr; + void *temp = workspace_ptr + matrix_bytes; + void *contiguous_output = y_contiguous ? 
y : (workspace_ptr + matrix_bytes * 2); + + CHECK_STATUS(initializeIdentity( + contiguous_output, _dtype, matrix_size, true, 0, 0, cuda_stream)); + + if (x_contiguous) { + CHECK_CUDA(cudaMemcpyAsync(base, x, matrix_bytes, cudaMemcpyDeviceToDevice, cuda_stream)); + } else { + CHECK_STATUS(packInputToContiguous( + base, x, _dtype, matrix_size, x_stride_0, x_stride_1, cuda_stream)); + } + + GemmTypeConfig cfg; + CHECK_STATUS(getGemmTypeConfig(_dtype, cfg)); + + void *result = contiguous_output; + void *scratch = temp; + void *base_matrix = base; + size_t power = n; + int matrix_dim = static_cast(matrix_size); + + CHECK_STATUS(_opaque->internal->useCublas( + cuda_stream, + [&](cublasHandle_t handle) { + while (power > 0) { + if (power & 1) { + CHECK_STATUS(gemmRowMajorSquare(handle, cfg, _dtype, matrix_dim, result, base_matrix, scratch)); + std::swap(result, scratch); + } + power >>= 1; + if (power == 0) { + break; + } + CHECK_STATUS(gemmRowMajorSquare(handle, cfg, _dtype, matrix_dim, base_matrix, base_matrix, scratch)); + std::swap(base_matrix, scratch); + } + return INFINI_STATUS_SUCCESS; + })); + + if (y_contiguous) { + if (result != y) { + CHECK_CUDA(cudaMemcpyAsync(y, result, matrix_bytes, cudaMemcpyDeviceToDevice, cuda_stream)); + } + } else { + CHECK_STATUS(scatterContiguousToOutput( + y, result, _dtype, matrix_size, y_stride_0, y_stride_1, cuda_stream)); + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::matrix_power::nvidia diff --git a/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cuh b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cuh new file mode 100644 index 000000000..d11c6914e --- /dev/null +++ b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cuh @@ -0,0 +1,71 @@ +#ifndef __MATRIX_POWER_NVIDIA_H__ +#define __MATRIX_POWER_NVIDIA_H__ + +#include "../../../operator.h" +#include + +namespace op::matrix_power::nvidia { + +class Descriptor final : public InfiniopDescriptor { + struct Opaque; + Opaque *_opaque; + 
infiniDtype_t _dtype; + size_t matrix_size; + size_t n; + size_t input_size; + size_t output_size; + size_t workspace_size; + ptrdiff_t x_stride_0; + ptrdiff_t x_stride_1; + ptrdiff_t y_stride_0; + ptrdiff_t y_stride_1; + bool x_contiguous; + bool y_contiguous; + + Descriptor(infiniDtype_t dtype, size_t matrix_size, size_t n, + size_t input_size, size_t output_size, + size_t workspace_size, + ptrdiff_t x_stride_0, ptrdiff_t x_stride_1, + ptrdiff_t y_stride_0, ptrdiff_t y_stride_1, + bool x_contiguous, bool y_contiguous, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _opaque(nullptr), + _dtype(dtype), + matrix_size(matrix_size), + n(n), + input_size(input_size), + output_size(output_size), + workspace_size(workspace_size), + x_stride_0(x_stride_0), + x_stride_1(x_stride_1), + y_stride_0(y_stride_0), + y_stride_1(y_stride_1), + x_contiguous(x_contiguous), + y_contiguous(y_contiguous) {} + +public: + ~Descriptor(); + + friend struct Opaque; + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n); + + size_t workspaceSize() const { return workspace_size; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::matrix_power::nvidia + +#endif // __MATRIX_POWER_NVIDIA_H__ diff --git a/src/infiniop/ops/matrix_power/operator.cc b/src/infiniop/ops/matrix_power/operator.cc new file mode 100644 index 000000000..79a81077a --- /dev/null +++ b/src/infiniop/ops/matrix_power/operator.cc @@ -0,0 +1,159 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/matrix_power.h" + +#ifdef ENABLE_CPU_API +#include "cpu/matrix_power_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/matrix_power_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include 
"metax/matrix_power_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/matrix_power_moore.h" +#endif + +__INFINI_C __export infiniStatus_t infiniopCreateMatrixPowerDescriptor( + infiniopHandle_t handle, + infiniopMatrixPowerDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int n) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::matrix_power::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ + n) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C __export infiniStatus_t infiniopGetMatrixPowerWorkspaceSize(infiniopMatrixPowerDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__INFINI_C __export infiniStatus_t infiniopMatrixPower( + infiniopMatrixPowerDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return 
reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, x, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C __export infiniStatus_t +infiniopDestroyMatrixPowerDescriptor(infiniopMatrixPowerDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.cc b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.cc new file mode 100644 index 000000000..15d0d6906 --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.cc @@ -0,0 +1,137 @@ +#include "pixel_shuffle_cpu.h" +#include "../../../tensor.h" + +namespace op::pixel_shuffle::cpu { + +utils::Result PixelShuffleInfo::create( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + int upscale_factor) { + + if (upscale_factor <= 0) { + return INFINI_STATUS_BAD_PARAM; + } + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 4) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + 
size_t batch = x_shape[0]; + size_t in_channels = x_shape[1]; + size_t height = x_shape[2]; + size_t width = x_shape[3]; + + // Input: (N, C*r^2, H, W) -> Output: (N, C, H*r, W*r) + if (in_channels % (upscale_factor * upscale_factor) != 0) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t out_channels = in_channels / (upscale_factor * upscale_factor); + size_t out_height = height * upscale_factor; + size_t out_width = width * upscale_factor; + + std::vector expected_y_shape = {batch, out_channels, out_height, out_width}; + if (y_shape != expected_y_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + PixelShuffleInfo info; + info.batch = batch; + info.in_channels = in_channels; + info.out_channels = out_channels; + info.height = height; + info.width = width; + info.upscale_factor = upscale_factor; + info.input_size = x_desc->numel(); + info.output_size = y_desc->numel(); + + return utils::Result(std::move(info)); +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES); + CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + + auto info_result = PixelShuffleInfo::create(x_desc, y_desc, upscale_factor); + CHECK_RESULT(info_result); + + *desc_ptr = new Descriptor(dtype, info_result.take(), handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +void pixel_shuffle_impl( + const PixelShuffleInfo &info, + T *y, + const T *x) { + + int r = info.upscale_factor; + + // Input: (N, C*r^2, H, W) + // 
Output: (N, C, H*r, W*r) + for (size_t n = 0; n < info.batch; ++n) { + for (size_t c = 0; c < info.out_channels; ++c) { + for (size_t h = 0; h < info.height; ++h) { + for (size_t w = 0; w < info.width; ++w) { + for (int i = 0; i < r; ++i) { + for (int j = 0; j < r; ++j) { + // Input channel index + size_t in_c = c * r * r + i * r + j; + // Input position + size_t in_idx = ((n * info.in_channels + in_c) * info.height + h) * info.width + w; + // Output position + size_t out_h = h * r + i; + size_t out_w = w * r + j; + size_t out_idx = ((n * info.out_channels + c) * (info.height * r) + out_h) * (info.width * r) + out_w; + y[out_idx] = x[in_idx]; + } + } + } + } + } + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + pixel_shuffle_impl(_info, reinterpret_cast(y), reinterpret_cast(x)); + break; + case INFINI_DTYPE_BF16: + pixel_shuffle_impl(_info, reinterpret_cast(y), reinterpret_cast(x)); + break; + case INFINI_DTYPE_F32: + pixel_shuffle_impl(_info, reinterpret_cast(y), reinterpret_cast(x)); + break; + case INFINI_DTYPE_F64: + pixel_shuffle_impl(_info, reinterpret_cast(y), reinterpret_cast(x)); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::pixel_shuffle::cpu diff --git a/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h new file mode 100644 index 000000000..2fa5e00fe --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h @@ -0,0 +1,58 @@ +#ifndef __PIXEL_SHUFFLE_CPU_H__ +#define __PIXEL_SHUFFLE_CPU_H__ + +#include "../../../devices/cpu/common_cpu.h" +#include "../../../operator.h" +#include + +namespace op::pixel_shuffle::cpu { + +struct PixelShuffleInfo { + size_t batch; + size_t in_channels; + size_t out_channels; + size_t height; + size_t width; + int upscale_factor; + 
size_t input_size; + size_t output_size; + + static utils::Result create( + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc, + int upscale_factor); +}; + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + PixelShuffleInfo _info; + + Descriptor(infiniDtype_t dtype, PixelShuffleInfo info, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::pixel_shuffle::cpu + +#endif // __PIXEL_SHUFFLE_CPU_H__ diff --git a/src/infiniop/ops/pixel_shuffle/cuda/kernel.cuh b/src/infiniop/ops/pixel_shuffle/cuda/kernel.cuh new file mode 100644 index 000000000..874b29cce --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/cuda/kernel.cuh @@ -0,0 +1,92 @@ +#pragma once +#include +#include + +namespace op::cuda { + +template +__global__ void pixel_shuffle_kernel( + T *output, + const T *input, + size_t batch, + size_t out_channels, + size_t height, + size_t width, + int r) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = batch * out_channels * height * width; + + if (idx >= total) { + return; + } + + size_t n = idx / (out_channels * height * width); + size_t rem = idx % (out_channels * height * width); + size_t c = rem / (height * width); + rem = rem % (height * width); + size_t oh = rem / width; + size_t ow = rem % width; + + // Calculate input indices + size_t w = ow / r; + size_t h = oh / r; + size_t i = oh % r; + size_t j = ow % r; + size_t in_c = c * r * r + i * r + j; + + size_t in_idx = ((n * (out_channels * r * r) + in_c) 
* (height / r) + h) * (width / r) + w; + size_t out_idx = ((n * out_channels + c) * height + oh) * width + ow; + + output[out_idx] = input[in_idx]; +} + +template +__global__ void pixel_shuffle_kernel_strided( + T *output, + const T *input, + size_t batch, + size_t out_channels, + size_t out_height, + size_t out_width, + int r, + ptrdiff_t x_stride0, + ptrdiff_t x_stride1, + ptrdiff_t x_stride2, + ptrdiff_t x_stride3, + ptrdiff_t y_stride0, + ptrdiff_t y_stride1, + ptrdiff_t y_stride2, + ptrdiff_t y_stride3) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = batch * out_channels * out_height * out_width; + + if (idx >= total) { + return; + } + + const size_t spatial = out_height * out_width; + const size_t chw = out_channels * spatial; + + size_t n = idx / chw; + size_t rem = idx % chw; + size_t c = rem / spatial; + rem = rem % spatial; + size_t oh = rem / out_width; + size_t ow = rem % out_width; + + const size_t upscale = static_cast(r); + const size_t ih = oh / upscale; + const size_t iw = ow / upscale; + const size_t i = oh % upscale; + const size_t j = ow % upscale; + const size_t in_c = c * upscale * upscale + i * upscale + j; + + const ptrdiff_t in_offset = static_cast(n) * x_stride0 + static_cast(in_c) * x_stride1 + static_cast(ih) * x_stride2 + static_cast(iw) * x_stride3; + const ptrdiff_t out_offset = static_cast(n) * y_stride0 + static_cast(c) * y_stride1 + static_cast(oh) * y_stride2 + static_cast(ow) * y_stride3; + + output[out_offset] = input[in_offset]; +} + +} // namespace op::cuda diff --git a/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.h b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.h new file mode 100644 index 000000000..34f216655 --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.h @@ -0,0 +1,56 @@ +#ifndef __PIXEL_SHUFFLE_METAX_H__ +#define __PIXEL_SHUFFLE_METAX_H__ + +#include "../../../operator.h" + +namespace op::pixel_shuffle::metax { + +class Descriptor final : 
public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t batch; + size_t in_channels; + size_t out_channels; + size_t height; + size_t width; + int upscale_factor; + size_t input_size; + size_t output_size; + + Descriptor(infiniDtype_t dtype, size_t batch, size_t in_channels, size_t out_channels, + size_t height, size_t width, int upscale_factor, + size_t input_size, size_t output_size, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + batch(batch), + in_channels(in_channels), + out_channels(out_channels), + height(height), + width(width), + upscale_factor(upscale_factor), + input_size(input_size), + output_size(output_size) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::pixel_shuffle::metax + +#endif // __PIXEL_SHUFFLE_METAX_H__ diff --git a/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.maca b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.maca new file mode 100644 index 000000000..3c8afd6f2 --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.maca @@ -0,0 +1,112 @@ +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../tensor.h" +#include "../cuda/kernel.cuh" +#include "pixel_shuffle_metax.h" + +namespace op::pixel_shuffle::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, 
INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES); + CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + + if (upscale_factor <= 0) { + return INFINI_STATUS_BAD_PARAM; + } + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 4) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t batch = x_shape[0]; + size_t in_channels = x_shape[1]; + size_t height = x_shape[2]; + size_t width = x_shape[3]; + + if (in_channels % (upscale_factor * upscale_factor) != 0) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t out_channels = in_channels / (upscale_factor * upscale_factor); + size_t out_height = height * upscale_factor; + size_t out_width = width * upscale_factor; + + std::vector expected_y_shape = {batch, out_channels, out_height, out_width}; + if (y_shape != expected_y_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + *desc_ptr = new Descriptor(dtype, batch, in_channels, out_channels, + height, width, upscale_factor, + x_desc->numel(), y_desc->numel(), + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + auto hc_stream = reinterpret_cast(stream); + constexpr int BLOCK_SIZE = 256; + size_t total = output_size; + if (total == 0) { + return INFINI_STATUS_SUCCESS; + } + int num_blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE; + + switch (_dtype) { + case INFINI_DTYPE_F16: + cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + case INFINI_DTYPE_BF16: + 
cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + case INFINI_DTYPE_F32: + cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + case INFINI_DTYPE_F64: + cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::pixel_shuffle::metax diff --git a/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.h b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.h new file mode 100644 index 000000000..e47a3edb2 --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.h @@ -0,0 +1,56 @@ +#ifndef __PIXEL_SHUFFLE_MOORE_H__ +#define __PIXEL_SHUFFLE_MOORE_H__ + +#include "../../../operator.h" + +namespace op::pixel_shuffle::moore { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t batch; + size_t in_channels; + size_t out_channels; + size_t height; + size_t width; + int upscale_factor; + size_t input_size; + size_t output_size; + + Descriptor(infiniDtype_t dtype, size_t batch, size_t in_channels, size_t out_channels, + size_t height, size_t width, int upscale_factor, + size_t input_size, size_t output_size, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + batch(batch), + in_channels(in_channels), + out_channels(out_channels), + height(height), + width(width), + upscale_factor(upscale_factor), + input_size(input_size), + output_size(output_size) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + 
infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::pixel_shuffle::moore + +#endif // __PIXEL_SHUFFLE_MOORE_H__ diff --git a/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.mu b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.mu new file mode 100644 index 000000000..5d2c113d6 --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.mu @@ -0,0 +1,112 @@ +#include "../../../devices/moore/moore_common.h" +#include "../../../devices/moore/moore_kernel_common.h" +#include "../../../tensor.h" +#include "../cuda/kernel.cuh" +#include "pixel_shuffle_moore.h" + +namespace op::pixel_shuffle::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES); + CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + + if (upscale_factor <= 0) { + return INFINI_STATUS_BAD_PARAM; + } + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 4) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t batch = x_shape[0]; + size_t in_channels = x_shape[1]; + size_t height = x_shape[2]; + size_t width = x_shape[3]; + + if (in_channels % (upscale_factor * upscale_factor) != 0) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t out_channels 
= in_channels / (upscale_factor * upscale_factor); + size_t out_height = height * upscale_factor; + size_t out_width = width * upscale_factor; + + std::vector expected_y_shape = {batch, out_channels, out_height, out_width}; + if (y_shape != expected_y_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + *desc_ptr = new Descriptor(dtype, batch, in_channels, out_channels, + height, width, upscale_factor, + x_desc->numel(), y_desc->numel(), + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + auto musa_stream = reinterpret_cast(stream); + constexpr int BLOCK_SIZE = 256; + size_t total = output_size; + if (total == 0) { + return INFINI_STATUS_SUCCESS; + } + int num_blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE; + + switch (_dtype) { + case INFINI_DTYPE_F16: + cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + case INFINI_DTYPE_BF16: + cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + case INFINI_DTYPE_F32: + cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + case INFINI_DTYPE_F64: + cuda::pixel_shuffle_kernel<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, height * upscale_factor, width * upscale_factor, + upscale_factor); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::pixel_shuffle::moore diff --git a/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cu 
b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cu new file mode 100644 index 000000000..af2d006ae --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cu @@ -0,0 +1,129 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../../../tensor.h" +#include "../cuda/kernel.cuh" +#include "pixel_shuffle_nvidia.cuh" +#include + +namespace op::pixel_shuffle::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor) { + + auto dtype = x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + if (y_desc->dtype() != dtype) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + if (upscale_factor <= 0) { + return INFINI_STATUS_BAD_PARAM; + } + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + + if (x_shape.size() != 4 || y_shape.size() != 4) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + if (x_desc->hasBroadcastDim() || y_desc->hasBroadcastDim()) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + size_t batch = x_shape[0]; + size_t in_channels = x_shape[1]; + size_t height = x_shape[2]; + size_t width = x_shape[3]; + + if (in_channels % (upscale_factor * upscale_factor) != 0) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t out_channels = in_channels / (upscale_factor * upscale_factor); + size_t out_height = height * upscale_factor; + size_t out_width = width * upscale_factor; + + std::vector expected_y_shape = {batch, out_channels, out_height, out_width}; + if (y_shape != expected_y_shape) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + std::array x_strides = {x_desc->stride(0), x_desc->stride(1), x_desc->stride(2), x_desc->stride(3)}; + std::array y_strides = {y_desc->stride(0), y_desc->stride(1), 
y_desc->stride(2), y_desc->stride(3)}; + + *desc_ptr = new Descriptor(dtype, batch, in_channels, out_channels, + height, width, upscale_factor, + x_desc->numel(), y_desc->numel(), + x_strides, y_strides, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + auto cuda_stream = reinterpret_cast(stream); + constexpr int BLOCK_SIZE = 256; + const size_t out_height = height * static_cast(upscale_factor); + const size_t out_width = width * static_cast(upscale_factor); + const size_t total = output_size; + if (total == 0) { + return INFINI_STATUS_SUCCESS; + } + int num_blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE; + + switch (_dtype) { + case INFINI_DTYPE_F16: + cuda::pixel_shuffle_kernel_strided<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, out_height, out_width, + upscale_factor, + x_strides[0], x_strides[1], x_strides[2], x_strides[3], + y_strides[0], y_strides[1], y_strides[2], y_strides[3]); + break; + case INFINI_DTYPE_BF16: + cuda::pixel_shuffle_kernel_strided<__nv_bfloat16><<>>( + reinterpret_cast<__nv_bfloat16 *>(y), + reinterpret_cast(x), + batch, out_channels, out_height, out_width, + upscale_factor, + x_strides[0], x_strides[1], x_strides[2], x_strides[3], + y_strides[0], y_strides[1], y_strides[2], y_strides[3]); + break; + case INFINI_DTYPE_F32: + cuda::pixel_shuffle_kernel_strided<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, out_height, out_width, + upscale_factor, + x_strides[0], x_strides[1], x_strides[2], x_strides[3], + y_strides[0], y_strides[1], y_strides[2], y_strides[3]); + break; + case INFINI_DTYPE_F64: + cuda::pixel_shuffle_kernel_strided<<>>( + reinterpret_cast(y), + reinterpret_cast(x), + batch, out_channels, out_height, out_width, + upscale_factor, + x_strides[0], x_strides[1], x_strides[2], x_strides[3], + y_strides[0], 
y_strides[1], y_strides[2], y_strides[3]); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::pixel_shuffle::nvidia diff --git a/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cuh b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cuh new file mode 100644 index 000000000..a5c3b7506 --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cuh @@ -0,0 +1,64 @@ +#ifndef __PIXEL_SHUFFLE_NVIDIA_H__ +#define __PIXEL_SHUFFLE_NVIDIA_H__ + +#include "../../../operator.h" +#include +#include + +namespace op::pixel_shuffle::nvidia { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + size_t batch; + size_t in_channels; + size_t out_channels; + size_t height; + size_t width; + int upscale_factor; + size_t input_size; + size_t output_size; + std::array x_strides; + std::array y_strides; + + Descriptor(infiniDtype_t dtype, size_t batch, size_t in_channels, size_t out_channels, + size_t height, size_t width, int upscale_factor, + size_t input_size, size_t output_size, + std::array x_strides, + std::array y_strides, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + batch(batch), + in_channels(in_channels), + out_channels(out_channels), + height(height), + width(width), + upscale_factor(upscale_factor), + input_size(input_size), + output_size(output_size), + x_strides(x_strides), + y_strides(y_strides) {} + +public: + ~Descriptor(); + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor); + + size_t workspaceSize() const { return 0; } + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const; +}; + +} // namespace op::pixel_shuffle::nvidia + +#endif // __PIXEL_SHUFFLE_NVIDIA_H__ diff 
--git a/src/infiniop/ops/pixel_shuffle/operator.cc b/src/infiniop/ops/pixel_shuffle/operator.cc new file mode 100644 index 000000000..3092787ad --- /dev/null +++ b/src/infiniop/ops/pixel_shuffle/operator.cc @@ -0,0 +1,159 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/pixel_shuffle.h" + +#ifdef ENABLE_CPU_API +#include "cpu/pixel_shuffle_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/pixel_shuffle_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/pixel_shuffle_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/pixel_shuffle_moore.h" +#endif + +__INFINI_C __export infiniStatus_t infiniopCreatePixelShuffleDescriptor( + infiniopHandle_t handle, + infiniopPixelShuffleDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int upscale_factor) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::pixel_shuffle::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ + upscale_factor) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__INFINI_C __export infiniStatus_t infiniopGetPixelShuffleWorkspaceSize(infiniopPixelShuffleDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef 
ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__INFINI_C __export infiniStatus_t infiniopPixelShuffle( + infiniopPixelShuffleDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, x, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__INFINI_C __export infiniStatus_t +infiniopDestroyPixelShuffleDescriptor(infiniopPixelShuffleDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/erf.py b/test/infiniop/erf.py new file mode 100644 index 000000000..733376b97 --- /dev/null +++ b/test/infiniop/erf.py @@ 
-0,0 +1,118 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import * +from enum import Enum, auto +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +_TEST_CASES_ = [ + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4, 4), None, None), +] + +_TEST_CASES = _TEST_CASES_ +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, +} + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + dtype=torch.float16, + sync=None, +): + print( + f"Testing Erf on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + input = TestTensor(shape, input_stride, dtype, device) + output = TestTensor(shape, output_stride, dtype, device) + + output.update_torch_tensor(torch.erf(input.torch_tensor())) + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateErfDescriptor( + handle, ctypes.byref(descriptor), output.descriptor, input.descriptor + ) + ) + + input.destroy_desc() + output.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetErfWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_f(): + check_error( + LIBINFINIOP.infiniopErf( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_f() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), 
atol=atol, rtol=rtol) + assert torch.allclose( + output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.erf(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_f(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyErfDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/erfc.py b/test/infiniop/erfc.py new file mode 100644 index 000000000..b2424496d --- /dev/null +++ b/test/infiniop/erfc.py @@ -0,0 +1,117 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import * +from enum import Enum, auto +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +_TEST_CASES_ = [ + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4, 4), None, None), +] + +_TEST_CASES = _TEST_CASES_ +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, +} + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + dtype=torch.float16, + sync=None, +): + print( + f"Testing Erfc on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + 
f"dtype:{InfiniDtypeNames[dtype]}" + ) + input = TestTensor(shape, input_stride, dtype, device) + output = TestTensor(shape, output_stride, dtype, device) + + output.update_torch_tensor(torch.erfc(input.torch_tensor())) + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateErfcDescriptor( + handle, ctypes.byref(descriptor), output.descriptor, input.descriptor + ) + ) + + input.destroy_desc() + output.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetErfcWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_f(): + check_error( + LIBINFINIOP.infiniopErfc( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_f() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose( + output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol + ) + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.erfc(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_f(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyErfcDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/erfinv.py b/test/infiniop/erfinv.py new file mode 100644 index 000000000..1979ee9c1 --- /dev/null +++ b/test/infiniop/erfinv.py @@ -0,0 +1,118 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import * 
+from enum import Enum, auto +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +_TEST_CASES_ = [ + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4, 4), None, None), +] + +_TEST_CASES = _TEST_CASES_ +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, +} + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + dtype=torch.float16, + sync=None, +): + print( + f"Testing Erfinv on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + input = TestTensor(shape, input_stride, dtype, device, scale=1.8, bias=-0.9) + + output = TestTensor(shape, output_stride, dtype, device) + output.update_torch_tensor(torch.erfinv(input.torch_tensor())) + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateErfinvDescriptor( + handle, ctypes.byref(descriptor), output.descriptor, input.descriptor + ) + ) + + input.destroy_desc() + output.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetErfinvWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_f(): + check_error( + LIBINFINIOP.infiniopErfinv( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_f() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose( + 
output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.erfinv(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_f(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyErfinvDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index cd3ad1b82..3675ca8d1 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -1753,3 +1753,165 @@ def silu_and_mul(lib): lib.infiniopDestroySiluAndMulDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] + + +@OpRegister.operator +def erf(lib): + lib.infiniopCreateErfDescriptor.restype = c_int32 + lib.infiniopCreateErfDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetErfWorkspaceSize.restype = c_int32 + lib.infiniopGetErfWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopErf.restype = c_int32 + lib.infiniopErf.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyErfDescriptor.restype = c_int32 + lib.infiniopDestroyErfDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def erfc(lib): + lib.infiniopCreateErfcDescriptor.restype = c_int32 + lib.infiniopCreateErfcDescriptor.argtypes = [ + infiniopHandle_t, + 
POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetErfcWorkspaceSize.restype = c_int32 + lib.infiniopGetErfcWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopErfc.restype = c_int32 + lib.infiniopErfc.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyErfcDescriptor.restype = c_int32 + lib.infiniopDestroyErfcDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def erfinv(lib): + lib.infiniopCreateErfinvDescriptor.restype = c_int32 + lib.infiniopCreateErfinvDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetErfinvWorkspaceSize.restype = c_int32 + lib.infiniopGetErfinvWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopErfinv.restype = c_int32 + lib.infiniopErfinv.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyErfinvDescriptor.restype = c_int32 + lib.infiniopDestroyErfinvDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def matrix_power(lib): + lib.infiniopCreateMatrixPowerDescriptor.restype = c_int32 + lib.infiniopCreateMatrixPowerDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_int32, + ] + + lib.infiniopGetMatrixPowerWorkspaceSize.restype = c_int32 + lib.infiniopGetMatrixPowerWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMatrixPower.restype = c_int32 + lib.infiniopMatrixPower.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + 
lib.infiniopDestroyMatrixPowerDescriptor.restype = c_int32 + lib.infiniopDestroyMatrixPowerDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def pixel_shuffle(lib): + lib.infiniopCreatePixelShuffleDescriptor.restype = c_int32 + lib.infiniopCreatePixelShuffleDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_int32, + ] + + lib.infiniopGetPixelShuffleWorkspaceSize.restype = c_int32 + lib.infiniopGetPixelShuffleWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopPixelShuffle.restype = c_int32 + lib.infiniopPixelShuffle.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyPixelShuffleDescriptor.restype = c_int32 + lib.infiniopDestroyPixelShuffleDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] diff --git a/test/infiniop/matrix_power.py b/test/infiniop/matrix_power.py new file mode 100644 index 000000000..7527fe249 --- /dev/null +++ b/test/infiniop/matrix_power.py @@ -0,0 +1,106 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +_TEST_CASES_ = [ + ((4, 4), None, None), + ((8, 8), None, None), +] + +_POWERS = [2, 3, 4] + +_TEST_CASES = [test_case + (n,) for test_case in _TEST_CASES_ for n in _POWERS] + +_TENSOR_DTYPES = [InfiniDtype.F32] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, +} + + +def test(handle, device, shape, input_stride, output_stride, n, dtype, sync=None): + print( + f"Testing MatrixPower on {InfiniDeviceNames[device]} with 
shape:{shape} input_stride:{input_stride} output_stride:{output_stride} power:{n} " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + if shape[-1] != shape[-2]: + return + + input = TestTensor(shape, input_stride, dtype, device) + output = TestTensor(shape, output_stride, dtype, device) + + output.update_torch_tensor(torch.linalg.matrix_power(input.torch_tensor(), n)) + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateMatrixPowerDescriptor( + handle, ctypes.byref(descriptor), output.descriptor, input.descriptor, n + ) + ) + + input.destroy_desc() + output.destroy_desc() + + ws = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetMatrixPowerWorkspaceSize(descriptor, ctypes.byref(ws)) + ) + workspace = TestWorkspace(ws.value, device) + + def lib_f(): + check_error( + LIBINFINIOP.infiniopMatrixPower( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_f() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose( + output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.linalg.matrix_power(input.torch_tensor(), n), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_f(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyMatrixPowerDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/pixel_shuffle.py b/test/infiniop/pixel_shuffle.py new file mode 100644 index 000000000..2fb690fe5 --- /dev/null +++ b/test/infiniop/pixel_shuffle.py @@ -0,0 +1,114 @@ +import torch +import ctypes +from 
ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + + +_TEST_CASES = [ + ((1, 4, 8, 8), None, None, 2), + ((2, 16, 4, 4), None, None, 2), + ((2, 16, 4, 4), None, None, 4), +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, +} + + +def test(handle, device, shape, input_stride, output_stride, upscale, dtype, sync=None): + print( + f"Testing PixelShuffle on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} upscale:{upscale} " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + r = upscale + assert shape[1] % (r * r) == 0 + + input = TestTensor(shape, input_stride, dtype, device) + + output_shape = ( + shape[0], + shape[1] // (r * r), + shape[2] * r, + shape[3] * r, + ) + + output = TestTensor(output_shape, output_stride, dtype, device) + + output.update_torch_tensor( + torch.nn.functional.pixel_shuffle(input.torch_tensor(), r) + ) + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreatePixelShuffleDescriptor( + handle, ctypes.byref(descriptor), output.descriptor, input.descriptor, r + ) + ) + + input.destroy_desc() + output.destroy_desc() + + ws = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetPixelShuffleWorkspaceSize(descriptor, ctypes.byref(ws)) + ) + workspace = TestWorkspace(ws.value, device) + + def lib_f(): + check_error( + LIBINFINIOP.infiniopPixelShuffle( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_f() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, 
rtol=rtol) + assert torch.allclose( + output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.nn.functional.pixel_shuffle(input.torch_tensor(), r), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_f(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyPixelShuffleDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mTest passed!\033[0m")