From 6431be37f3522c74a193c5617dcd6602b4528c97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B5=96=E6=B3=89laiquan?=
 <167113918+LaiQuan-conquer@users.noreply.github.com>
Date: Tue, 7 Apr 2026 10:45:01 +0800
Subject: [PATCH 1/2] issue/1031 merge T1-1-41

---
 include/infiniop.h                            |   5 +
 include/infiniop/ops/erf.h                    |  24 +
 include/infiniop/ops/erfc.h                   |  24 +
 include/infiniop/ops/erfinv.h                 |  24 +
 include/infiniop/ops/matrix_power.h           |  25 +
 include/infiniop/ops/pixel_shuffle.h          |  25 +
 src/infiniop/ops/erf/cpu/erf_cpu.cc           |  52 ++
 src/infiniop/ops/erf/cpu/erf_cpu.h            |  20 +
 src/infiniop/ops/erf/cuda/kernel.cuh          |  36 ++
 src/infiniop/ops/erf/erf.h                    |   8 +
 src/infiniop/ops/erf/metax/erf_metax.h        |   8 +
 src/infiniop/ops/erf/metax/erf_metax.maca     |  60 ++
 src/infiniop/ops/erf/moore/erf_moore.h        |   8 +
 src/infiniop/ops/erf/moore/erf_moore.mu       |  60 ++
 src/infiniop/ops/erf/moore/erf_moore_kernel.h |  36 ++
 src/infiniop/ops/erf/nvidia/erf_nvidia.cu     |  58 ++
 src/infiniop/ops/erf/nvidia/erf_nvidia.cuh    |   8 +
 src/infiniop/ops/erf/operator.cc              | 157 +++++
 src/infiniop/ops/erfc/cpu/erfc_cpu.cc         |  52 ++
 src/infiniop/ops/erfc/cpu/erfc_cpu.h          |  20 +
 src/infiniop/ops/erfc/cuda/kernel.cuh         |  36 ++
 src/infiniop/ops/erfc/erfc.h                  |   8 +
 src/infiniop/ops/erfc/metax/erfc_metax.h      |   8 +
 src/infiniop/ops/erfc/metax/erfc_metax.maca   |  60 ++
 src/infiniop/ops/erfc/moore/erfc_moore.h      |   8 +
 src/infiniop/ops/erfc/moore/erfc_moore.mu     |  60 ++
 .../ops/erfc/moore/erfc_moore_kernel.h        |  36 ++
 src/infiniop/ops/erfc/nvidia/erfc_nvidia.cu   |  58 ++
 src/infiniop/ops/erfc/nvidia/erfc_nvidia.cuh  |   8 +
 src/infiniop/ops/erfc/operator.cc             | 157 +++++
 src/infiniop/ops/erfinv/cpu/erfinv_cpu.cc     |  52 ++
 src/infiniop/ops/erfinv/cpu/erfinv_cpu.h      |  46 ++
 src/infiniop/ops/erfinv/cuda/kernel.cuh       | 113 ++++
 src/infiniop/ops/erfinv/erfinv.h              |   8 +
 src/infiniop/ops/erfinv/metax/erfinv_metax.h  |   8 +
 .../ops/erfinv/metax/erfinv_metax.maca        |  60 ++
 src/infiniop/ops/erfinv/moore/erfinv_moore.h  |   8 +
 src/infiniop/ops/erfinv/moore/erfinv_moore.mu |  60 ++
 .../ops/erfinv/moore/erfinv_moore_kernel.h    |  72 +++
 .../ops/erfinv/nvidia/erfinv_nvidia.cu        |  58 ++
 .../ops/erfinv/nvidia/erfinv_nvidia.cuh       |   8 +
 src/infiniop/ops/erfinv/operator.cc           | 157 +++++
 .../ops/matrix_power/cpu/matrix_power_cpu.cc  | 201 +++++++
 .../ops/matrix_power/cpu/matrix_power_cpu.h   |  69 +++
 .../matrix_power/metax/matrix_power_metax.h   |  48 ++
 .../metax/matrix_power_metax.maca             | 107 ++++
 .../matrix_power/moore/matrix_power_moore.h   |  48 ++
 .../matrix_power/moore/matrix_power_moore.mu  | 106 ++++
 .../nvidia/matrix_power_nvidia.cu             | 548 ++++++++++++++++++
 .../nvidia/matrix_power_nvidia.cuh            |  72 +++
 src/infiniop/ops/matrix_power/operator.cc     | 159 +++++
 .../pixel_shuffle/cpu/pixel_shuffle_cpu.cc    | 137 +++++
 .../ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h |  58 ++
 .../ops/pixel_shuffle/cuda/kernel.cuh         |  95 +++
 .../pixel_shuffle/metax/pixel_shuffle_metax.h |  57 ++
 .../metax/pixel_shuffle_metax.maca            | 113 ++++
 .../pixel_shuffle/moore/pixel_shuffle_moore.h |  57 ++
 .../moore/pixel_shuffle_moore.mu              | 113 ++++
 .../nvidia/pixel_shuffle_nvidia.cu            | 129 +++++
 .../nvidia/pixel_shuffle_nvidia.cuh           |  65 +++
 src/infiniop/ops/pixel_shuffle/operator.cc    | 159 +++++
 61 files changed, 4140 insertions(+)
 create mode 100644 include/infiniop/ops/erf.h
 create mode 100644 include/infiniop/ops/erfc.h
 create mode 100644 include/infiniop/ops/erfinv.h
 create mode 100644 include/infiniop/ops/matrix_power.h
 create mode 100644 include/infiniop/ops/pixel_shuffle.h
 create mode 100644 src/infiniop/ops/erf/cpu/erf_cpu.cc
 create mode 100644 src/infiniop/ops/erf/cpu/erf_cpu.h
 create mode 100644 src/infiniop/ops/erf/cuda/kernel.cuh
 create mode 100644 src/infiniop/ops/erf/erf.h
 create mode 100644 src/infiniop/ops/erf/metax/erf_metax.h
 create mode 100644 src/infiniop/ops/erf/metax/erf_metax.maca
 create mode 100644 src/infiniop/ops/erf/moore/erf_moore.h
 create mode 100644 src/infiniop/ops/erf/moore/erf_moore.mu
 create mode 100644 src/infiniop/ops/erf/moore/erf_moore_kernel.h
 create mode 100644 src/infiniop/ops/erf/nvidia/erf_nvidia.cu
 create mode 100644 src/infiniop/ops/erf/nvidia/erf_nvidia.cuh
 create mode 100644 src/infiniop/ops/erf/operator.cc
 create mode 100644 src/infiniop/ops/erfc/cpu/erfc_cpu.cc
 create mode 100644 src/infiniop/ops/erfc/cpu/erfc_cpu.h
 create mode 100644 src/infiniop/ops/erfc/cuda/kernel.cuh
 create mode 100644 src/infiniop/ops/erfc/erfc.h
 create mode 100644 src/infiniop/ops/erfc/metax/erfc_metax.h
 create mode 100644 src/infiniop/ops/erfc/metax/erfc_metax.maca
 create mode 100644 src/infiniop/ops/erfc/moore/erfc_moore.h
 create mode 100644 src/infiniop/ops/erfc/moore/erfc_moore.mu
 create mode 100644 src/infiniop/ops/erfc/moore/erfc_moore_kernel.h
 create mode 100644 src/infiniop/ops/erfc/nvidia/erfc_nvidia.cu
 create mode 100644 src/infiniop/ops/erfc/nvidia/erfc_nvidia.cuh
 create mode 100644 src/infiniop/ops/erfc/operator.cc
 create mode 100644 src/infiniop/ops/erfinv/cpu/erfinv_cpu.cc
 create mode 100644 src/infiniop/ops/erfinv/cpu/erfinv_cpu.h
 create mode 100644 src/infiniop/ops/erfinv/cuda/kernel.cuh
 create mode 100644 src/infiniop/ops/erfinv/erfinv.h
 create mode 100644 src/infiniop/ops/erfinv/metax/erfinv_metax.h
 create mode 100644 src/infiniop/ops/erfinv/metax/erfinv_metax.maca
 create mode 100644 src/infiniop/ops/erfinv/moore/erfinv_moore.h
 create mode 100644 src/infiniop/ops/erfinv/moore/erfinv_moore.mu
 create mode 100644 src/infiniop/ops/erfinv/moore/erfinv_moore_kernel.h
 create mode 100644 src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cu
 create mode 100644 src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cuh
 create mode 100644 src/infiniop/ops/erfinv/operator.cc
 create mode 100644 src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.cc
 create mode 100644 src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.h
 create mode 100644 src/infiniop/ops/matrix_power/metax/matrix_power_metax.h
 create mode 100644 src/infiniop/ops/matrix_power/metax/matrix_power_metax.maca
 create mode 100644 src/infiniop/ops/matrix_power/moore/matrix_power_moore.h
 create mode 100644 src/infiniop/ops/matrix_power/moore/matrix_power_moore.mu
 create mode 100644 src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cu
 create mode 100644 src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cuh
 create mode 100644 src/infiniop/ops/matrix_power/operator.cc
 create mode 100644 src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.cc
 create mode 100644 src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h
 create mode 100644 src/infiniop/ops/pixel_shuffle/cuda/kernel.cuh
 create mode 100644 src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.h
 create mode 100644 src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.maca
 create mode 100644 src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.h
 create mode 100644 src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.mu
 create mode 100644 src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cu
 create mode 100644 src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cuh
 create mode 100644 src/infiniop/ops/pixel_shuffle/operator.cc

diff --git a/include/infiniop.h b/include/infiniop.h
index fe9537876..104476037 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -30,6 +30,9 @@
 #include "infiniop/ops/dequantize_gptq.h"
 #include "infiniop/ops/embedding.h"
 #include "infiniop/ops/equal.h"
+#include "infiniop/ops/erf.h"
+#include "infiniop/ops/erfc.h"
+#include "infiniop/ops/erfinv.h"
 #include "infiniop/ops/flash_attention.h"
 #include "infiniop/ops/flipud.h"
 #include "infiniop/ops/float_power.h"
@@ -58,12 +61,14 @@
 #include "infiniop/ops/logcumsumexp.h"
 #include "infiniop/ops/lp_norm.h"
 #include "infiniop/ops/masked_select.h"
+#include "infiniop/ops/matrix_power.h"
 #include "infiniop/ops/mul.h"
 #include "infiniop/ops/multi_margin_loss.h"
 #include "infiniop/ops/ones.h"
 #include "infiniop/ops/paged_attention.h"
 #include "infiniop/ops/paged_attention_prefill.h"
 #include "infiniop/ops/paged_caching.h"
+#include "infiniop/ops/pixel_shuffle.h"
 #include "infiniop/ops/quant/per_channel_quant_int8.h"
 #include "infiniop/ops/quant/per_tensor_quant_int8.h"
 #include "infiniop/ops/random_sample.h"
diff --git a/include/infiniop/ops/erf.h b/include/infiniop/ops/erf.h
new file mode 100644
index 000000000..8cbb8fb74
--- /dev/null
+++ b/include/infiniop/ops/erf.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_ERF_API_H__
+#define __INFINIOP_ERF_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopErfDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateErfDescriptor(infiniopHandle_t handle,
+                                                        infiniopErfDescriptor_t *desc_ptr,
+                                                        infiniopTensorDescriptor_t y,
+                                                        infiniopTensorDescriptor_t x);
+
+__C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopErf(infiniopErfDescriptor_t desc,
+                                        void *workspace,
+                                        size_t workspace_size,
+                                        void *y,
+                                        const void *x,
+                                        void *stream);
+
+__C __export infiniStatus_t infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc);
+
+#endif
diff --git a/include/infiniop/ops/erfc.h b/include/infiniop/ops/erfc.h
new file mode 100644
index 000000000..6454573bc
--- /dev/null
+++ b/include/infiniop/ops/erfc.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_ERFC_API_H__
+#define __INFINIOP_ERFC_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopErfcDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateErfcDescriptor(infiniopHandle_t handle,
+                                                         infiniopErfcDescriptor_t *desc_ptr,
+                                                         infiniopTensorDescriptor_t y,
+                                                         infiniopTensorDescriptor_t x);
+
+__C __export infiniStatus_t infiniopGetErfcWorkspaceSize(infiniopErfcDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopErfc(infiniopErfcDescriptor_t desc,
+                                         void *workspace,
+                                         size_t workspace_size,
+                                         void *y,
+                                         const void *x,
+                                         void *stream);
+
+__C __export infiniStatus_t infiniopDestroyErfcDescriptor(infiniopErfcDescriptor_t desc);
+
+#endif
diff --git a/include/infiniop/ops/erfinv.h b/include/infiniop/ops/erfinv.h
new file mode 100644
index 000000000..79bc09f22
--- /dev/null
+++ b/include/infiniop/ops/erfinv.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_ERFINV_API_H__
+#define __INFINIOP_ERFINV_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopErfinvDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateErfinvDescriptor(infiniopHandle_t handle,
+                                                           infiniopErfinvDescriptor_t *desc_ptr,
+                                                           infiniopTensorDescriptor_t y,
+                                                           infiniopTensorDescriptor_t x);
+
+__C __export infiniStatus_t infiniopGetErfinvWorkspaceSize(infiniopErfinvDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopErfinv(infiniopErfinvDescriptor_t desc,
+                                           void *workspace,
+                                           size_t workspace_size,
+                                           void *y,
+                                           const void *x,
+                                           void *stream);
+
+__C __export infiniStatus_t infiniopDestroyErfinvDescriptor(infiniopErfinvDescriptor_t desc);
+
+#endif
diff --git a/include/infiniop/ops/matrix_power.h b/include/infiniop/ops/matrix_power.h
new file mode 100644
index 000000000..acd7c0c7e
--- /dev/null
+++ b/include/infiniop/ops/matrix_power.h
@@ -0,0 +1,25 @@
+#ifndef __INFINIOP_MATRIX_POWER_API_H__
+#define __INFINIOP_MATRIX_POWER_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopMatrixPowerDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateMatrixPowerDescriptor(infiniopHandle_t handle,
+                                                                infiniopMatrixPowerDescriptor_t *desc_ptr,
+                                                                infiniopTensorDescriptor_t y,
+                                                                infiniopTensorDescriptor_t x,
+                                                                int n);
+
+__C __export infiniStatus_t infiniopGetMatrixPowerWorkspaceSize(infiniopMatrixPowerDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopMatrixPower(infiniopMatrixPowerDescriptor_t desc,
+                                                void *workspace,
+                                                size_t workspace_size,
+                                                void *y,
+                                                const void *x,
+                                                void *stream);
+
+__C __export infiniStatus_t infiniopDestroyMatrixPowerDescriptor(infiniopMatrixPowerDescriptor_t desc);
+
+#endif
diff --git a/include/infiniop/ops/pixel_shuffle.h b/include/infiniop/ops/pixel_shuffle.h
new file mode 100644
index 000000000..941a91cfc
--- /dev/null
+++ b/include/infiniop/ops/pixel_shuffle.h
@@ -0,0 +1,25 @@
+#ifndef __INFINIOP_PIXEL_SHUFFLE_API_H__
+#define __INFINIOP_PIXEL_SHUFFLE_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopPixelShuffleDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreatePixelShuffleDescriptor(infiniopHandle_t handle,
+                                                                 infiniopPixelShuffleDescriptor_t *desc_ptr,
+                                                                 infiniopTensorDescriptor_t y,
+                                                                 infiniopTensorDescriptor_t x,
+                                                                 int upscale_factor);
+
+__C __export infiniStatus_t infiniopGetPixelShuffleWorkspaceSize(infiniopPixelShuffleDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopPixelShuffle(infiniopPixelShuffleDescriptor_t desc,
+                                                 void *workspace,
+                                                 size_t workspace_size,
+                                                 void *y,
+                                                 const void *x,
+                                                 void *stream);
+
+__C __export infiniStatus_t infiniopDestroyPixelShuffleDescriptor(infiniopPixelShuffleDescriptor_t desc);
+
+#endif
diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.cc b/src/infiniop/ops/erf/cpu/erf_cpu.cc
new file mode 100644
index 000000000..7d127bfae
--- /dev/null
+++ b/src/infiniop/ops/erf/cpu/erf_cpu.cc
@@ -0,0 +1,52 @@
+#include "erf_cpu.h"
+
+namespace op::erf::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<ErfOp, bf16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<ErfOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<ErfOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<ErfOp, double>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::erf::cpu
diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.h b/src/infiniop/ops/erf/cpu/erf_cpu.h
new file mode 100644
index 000000000..74ad19e57
--- /dev/null
+++ b/src/infiniop/ops/erf/cpu/erf_cpu.h
@@ -0,0 +1,20 @@
+#ifndef __ERF_CPU_H__
+#define __ERF_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+#include <cmath>
+
+ELEMENTWISE_DESCRIPTOR(erf, cpu)
+
+namespace op::erf::cpu {
+typedef struct ErfOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    T operator()(const T &x) const {
+        return std::erf(x);
+    }
+} ErfOp;
+} // namespace op::erf::cpu
+
+#endif // __ERF_CPU_H__
diff --git a/src/infiniop/ops/erf/cuda/kernel.cuh b/src/infiniop/ops/erf/cuda/kernel.cuh
new file mode 100644
index 000000000..9bd6cff21
--- /dev/null
+++ b/src/infiniop/ops/erf/cuda/kernel.cuh
@@ -0,0 +1,36 @@
+#pragma once
+#include <cmath>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <type_traits>
+
+namespace op::cuda {
+
+struct ErfOp {
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(T x) const {
+        if constexpr (std::is_same_v<T, float>) {
+            return erff(x);
+        } else if constexpr (std::is_same_v<T, double>) {
+            return erf(x);
+        } else {
+            // For F16/BF16: promote to float, compute, then cast back
+            float xf;
+            if constexpr (std::is_same_v<T, half>) {
+                xf = __half2float(x);
+                return __float2half_rn(erff(xf));
+            } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+                xf = __bfloat162float(x);
+                return __float2bfloat16_rn(erff(xf));
+            } else {
+                xf = static_cast<float>(x);
+                return static_cast<T>(erff(xf));
+            }
+        }
+    }
+};
+
+} // namespace op::cuda
diff --git a/src/infiniop/ops/erf/erf.h b/src/infiniop/ops/erf/erf.h
new file mode 100644
index 000000000..7c967dea2
--- /dev/null
+++ b/src/infiniop/ops/erf/erf.h
@@ -0,0 +1,8 @@
+#ifndef __ERF_H__
+#define __ERF_H__
+
+#include "../../elementwise/elementwise.h"
+
+#define DESCRIPTOR(NAMESPACE) ELEMENTWISE_DESCRIPTOR(erf, NAMESPACE)
+
+#endif // __ERF_H__
diff --git a/src/infiniop/ops/erf/metax/erf_metax.h b/src/infiniop/ops/erf/metax/erf_metax.h
new file mode 100644
index 000000000..5dfe23bbe
--- /dev/null
+++ b/src/infiniop/ops/erf/metax/erf_metax.h
@@ -0,0 +1,8 @@
+#ifndef __ERF_METAX_API_H__
+#define __ERF_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(erf, metax)
+
+#endif // __ERF_METAX_API_H__
diff --git a/src/infiniop/ops/erf/metax/erf_metax.maca b/src/infiniop/ops/erf/metax/erf_metax.maca
new file mode 100644
index 000000000..fcf956af2
--- /dev/null
+++ b/src/infiniop/ops/erf/metax/erf_metax.maca
@@ -0,0 +1,60 @@
+#include "erf_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::erf::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::ErfOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::ErfOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::ErfOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::ErfOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::erf::metax
diff --git a/src/infiniop/ops/erf/moore/erf_moore.h b/src/infiniop/ops/erf/moore/erf_moore.h
new file mode 100644
index 000000000..620055688
--- /dev/null
+++ b/src/infiniop/ops/erf/moore/erf_moore.h
@@ -0,0 +1,8 @@
+#ifndef __ERF_MOORE_API_H__
+#define __ERF_MOORE_API_H__
+
+#include "../../../elementwise/moore/elementwise_moore_api.h"
+
+ELEMENTWISE_DESCRIPTOR(erf, moore)
+
+#endif // __ERF_MOORE_API_H__
diff --git a/src/infiniop/ops/erf/moore/erf_moore.mu b/src/infiniop/ops/erf/moore/erf_moore.mu
new file mode 100644
index 000000000..1f717fa51
--- /dev/null
+++ b/src/infiniop/ops/erf/moore/erf_moore.mu
@@ -0,0 +1,60 @@
+#include "erf_moore.h"
+
+#include "../../../elementwise/moore/elementwise_moore.h"
+
+#include "erf_moore_kernel.h"
+
+namespace op::erf::moore {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, moore::ErfOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, moore::ErfOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, moore::ErfOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, moore::ErfOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::erf::moore
diff --git a/src/infiniop/ops/erf/moore/erf_moore_kernel.h b/src/infiniop/ops/erf/moore/erf_moore_kernel.h
new file mode 100644
index 000000000..8ddc9d5f1
--- /dev/null
+++ b/src/infiniop/ops/erf/moore/erf_moore_kernel.h
@@ -0,0 +1,36 @@
+#ifndef __ERF_MOORE_KERNEL_H__
+#define __ERF_MOORE_KERNEL_H__
+
+#include <cmath>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <type_traits>
+
+namespace op::erf::moore {
+
+typedef struct ErfOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            float x0 = __low2float(x);
+            float x1 = __high2float(x);
+            return __floats2half2_rn(erff(x0), erff(x1));
+        } else if constexpr (std::is_same_v<T, half>) {
+            float xf = __half2float(x);
+            return __float2half(erff(xf));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            float xf = __bfloat162float(x);
+            return __float2bfloat16_rn(erff(xf));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return erff(x);
+        } else { // double
+            return erf(x);
+        }
+    }
+} ErfOp;
+
+} // namespace op::erf::moore
+
+#endif // __ERF_MOORE_KERNEL_H__
diff --git a/src/infiniop/ops/erf/nvidia/erf_nvidia.cu b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu
new file mode 100644
index 000000000..03e14bb57
--- /dev/null
+++ b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu
@@ -0,0 +1,58 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "erf_nvidia.cuh"
+
+namespace op::erf::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::ErfOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::ErfOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::ErfOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::ErfOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::erf::nvidia
diff --git a/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh b/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh
new file mode 100644
index 000000000..d20658027
--- /dev/null
+++ b/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __ERF_NVIDIA_H__
+#define __ERF_NVIDIA_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(erf, nvidia)
+
+#endif // __ERF_NVIDIA_H__
diff --git a/src/infiniop/ops/erf/operator.cc b/src/infiniop/ops/erf/operator.cc
new file mode 100644
index 000000000..f9b61e981
--- /dev/null
+++ b/src/infiniop/ops/erf/operator.cc
@@ -0,0 +1,157 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/erf.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/erf_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
+#include "nvidia/erf_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/erf_metax.h"
+#endif
+#ifdef ENABLE_MOORE_API
+#include "moore/erf_moore.h"
+#endif
+
+__C __export infiniStatus_t infiniopCreateErfDescriptor(
+    infiniopHandle_t handle,
+    infiniopErfDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc) {
+
+#define CREATE(CASE, NAMESPACE)                                              \
+    case CASE:                                                               \
+        return op::erf::NAMESPACE::Descriptor::create(                      \
+            handle,                                                          \
+            reinterpret_cast<op::erf::NAMESPACE::Descriptor **>(desc_ptr),  \
+            y_desc,                                                          \
+            {x_desc})
+
+    switch (handle->device) {
+
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+__C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size) {
+
+#define GET(CASE, NAMESPACE)                                                                 \
+    case CASE:                                                                               \
+        *size = reinterpret_cast<op::erf::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu)
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia)
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax)
+#endif
+#ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore)
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef GET
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C __export infiniStatus_t infiniopErf(
+    infiniopErfDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                              \
+    case CASE:                                                                  \
+        return reinterpret_cast<const op::erf::NAMESPACE::Descriptor *>(desc)  \
+            ->calculate(workspace, workspace_size, y, {x}, stream)
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+__C __export infiniStatus_t
+infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                                  \
+    case CASE:                                                                   \
+        delete reinterpret_cast<const op::erf::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DELETE
+}
diff --git a/src/infiniop/ops/erfc/cpu/erfc_cpu.cc b/src/infiniop/ops/erfc/cpu/erfc_cpu.cc
new file mode 100644
index 000000000..35b82c678
--- /dev/null
+++ b/src/infiniop/ops/erfc/cpu/erfc_cpu.cc
@@ -0,0 +1,52 @@
+#include "erfc_cpu.h"
+
+namespace op::erfc::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<ErfcOp, bf16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<ErfcOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<ErfcOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<ErfcOp, double>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::erfc::cpu
diff --git a/src/infiniop/ops/erfc/cpu/erfc_cpu.h b/src/infiniop/ops/erfc/cpu/erfc_cpu.h
new file mode 100644
index 000000000..dd6d69496
--- /dev/null
+++ b/src/infiniop/ops/erfc/cpu/erfc_cpu.h
@@ -0,0 +1,20 @@
+#ifndef __ERFC_CPU_H__
+#define __ERFC_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+#include <cmath>
+
+ELEMENTWISE_DESCRIPTOR(erfc, cpu)
+
+namespace op::erfc::cpu {
+typedef struct ErfcOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    T operator()(const T &x) const {
+        return std::erfc(x);
+    }
+} ErfcOp;
+} // namespace op::erfc::cpu
+
+#endif // __ERFC_CPU_H__
diff --git a/src/infiniop/ops/erfc/cuda/kernel.cuh b/src/infiniop/ops/erfc/cuda/kernel.cuh
new file mode 100644
index 000000000..aae8efcee
--- /dev/null
+++ b/src/infiniop/ops/erfc/cuda/kernel.cuh
@@ -0,0 +1,36 @@
+#pragma once
+#include <cmath>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <type_traits>
+
+namespace op::cuda {
+
+struct ErfcOp {
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(T x) const {
+        if constexpr (std::is_same_v<T, float>) {
+            return erfcf(x);
+        } else if constexpr (std::is_same_v<T, double>) {
+            return erfc(x);
+        } else {
+            // For F16/BF16: promote to float, compute, then cast back
+            float xf;
+            if constexpr (std::is_same_v<T, half>) {
+                xf = __half2float(x);
+                return __float2half_rn(erfcf(xf));
+            } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+                xf = __bfloat162float(x);
+                return __float2bfloat16_rn(erfcf(xf));
+            } else {
+                xf = static_cast<float>(x);
+                return static_cast<T>(erfcf(xf));
+            }
+        }
+    }
+};
+
+} // namespace op::cuda
diff --git a/src/infiniop/ops/erfc/erfc.h b/src/infiniop/ops/erfc/erfc.h
new file mode 100644
index 000000000..9ee12fd43
--- /dev/null
+++ b/src/infiniop/ops/erfc/erfc.h
@@ -0,0 +1,8 @@
+#ifndef __ERFC_H__
+#define __ERFC_H__
+
+#include "../../elementwise/elementwise.h"
+
+#define DESCRIPTOR(NAMESPACE) ELEMENTWISE_DESCRIPTOR(erfc, NAMESPACE)
+
+#endif // __ERFC_H__
diff --git a/src/infiniop/ops/erfc/metax/erfc_metax.h b/src/infiniop/ops/erfc/metax/erfc_metax.h
new file mode 100644
index 000000000..438f00095
--- /dev/null
+++ b/src/infiniop/ops/erfc/metax/erfc_metax.h
@@ -0,0 +1,8 @@
+#ifndef __ERFC_METAX_API_H__
+#define __ERFC_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(erfc, metax)
+
+#endif // __ERFC_METAX_API_H__
diff --git a/src/infiniop/ops/erfc/metax/erfc_metax.maca b/src/infiniop/ops/erfc/metax/erfc_metax.maca
new file mode 100644
index 000000000..925e52ccf
--- /dev/null
+++ b/src/infiniop/ops/erfc/metax/erfc_metax.maca
@@ -0,0 +1,60 @@
+#include "erfc_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::erfc::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::ErfcOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::ErfcOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::ErfcOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::ErfcOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::erfc::metax
diff --git a/src/infiniop/ops/erfc/moore/erfc_moore.h b/src/infiniop/ops/erfc/moore/erfc_moore.h
new file mode 100644
index 000000000..d032e4305
--- /dev/null
+++ b/src/infiniop/ops/erfc/moore/erfc_moore.h
@@ -0,0 +1,8 @@
+#ifndef __ERFC_MOORE_API_H__
+#define __ERFC_MOORE_API_H__
+
+#include "../../../elementwise/moore/elementwise_moore_api.h"
+
+ELEMENTWISE_DESCRIPTOR(erfc, moore)
+
+#endif // __ERFC_MOORE_API_H__
diff --git a/src/infiniop/ops/erfc/moore/erfc_moore.mu b/src/infiniop/ops/erfc/moore/erfc_moore.mu
new file mode 100644
index 000000000..d1eaec1bf
--- /dev/null
+++ b/src/infiniop/ops/erfc/moore/erfc_moore.mu
@@ -0,0 +1,60 @@
+#include "erfc_moore.h"
+
+#include "../../../elementwise/moore/elementwise_moore.h"
+
+#include "erfc_moore_kernel.h"
+
+namespace op::erfc::moore {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, moore::ErfcOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, moore::ErfcOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, moore::ErfcOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, moore::ErfcOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::erfc::moore
diff --git a/src/infiniop/ops/erfc/moore/erfc_moore_kernel.h b/src/infiniop/ops/erfc/moore/erfc_moore_kernel.h
new file mode 100644
index 000000000..cd5225c3b
--- /dev/null
+++ b/src/infiniop/ops/erfc/moore/erfc_moore_kernel.h
@@ -0,0 +1,36 @@
+#ifndef __ERFC_MOORE_KERNEL_H__
+#define __ERFC_MOORE_KERNEL_H__
+
+#include <cmath>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <type_traits>
+
+namespace op::erfc::moore {
+
+typedef struct ErfcOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            float x0 = __low2float(x);
+            float x1 = __high2float(x);
+            return __floats2half2_rn(erfcf(x0), erfcf(x1));
+        } else if constexpr (std::is_same_v<T, half>) {
+            float xf = __half2float(x);
+            return __float2half(erfcf(xf));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            float xf = __bfloat162float(x);
+            return __float2bfloat16_rn(erfcf(xf));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return erfcf(x);
+        } else { // double
+            return erfc(x);
+        }
+    }
+} ErfcOp;
+
+} // namespace op::erfc::moore
+
+#endif // __ERFC_MOORE_KERNEL_H__
diff --git a/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cu b/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cu
new file mode 100644
index 000000000..483f11a18
--- /dev/null
+++ b/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cu
@@ -0,0 +1,58 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "erfc_nvidia.cuh"
+
+namespace op::erfc::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::ErfcOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::ErfcOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::ErfcOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::ErfcOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::erfc::nvidia
diff --git a/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cuh b/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cuh
new file mode 100644
index 000000000..4d5321c9d
--- /dev/null
+++ b/src/infiniop/ops/erfc/nvidia/erfc_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __ERFC_NVIDIA_H__
+#define __ERFC_NVIDIA_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(erfc, nvidia)
+
+#endif // __ERFC_NVIDIA_H__
diff --git a/src/infiniop/ops/erfc/operator.cc b/src/infiniop/ops/erfc/operator.cc
new file mode 100644
index 000000000..fa102c90c
--- /dev/null
+++ b/src/infiniop/ops/erfc/operator.cc
@@ -0,0 +1,157 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/erfc.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/erfc_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
+#include "nvidia/erfc_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/erfc_metax.h"
+#endif
+#ifdef ENABLE_MOORE_API
+#include "moore/erfc_moore.h"
+#endif
+
+__C __export infiniStatus_t infiniopCreateErfcDescriptor(
+    infiniopHandle_t handle,
+    infiniopErfcDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc) {
+
+#define CREATE(CASE, NAMESPACE)                                              \
+    case CASE:                                                               \
+        return op::erfc::NAMESPACE::Descriptor::create(                     \
+            handle,                                                          \
+            reinterpret_cast<op::erfc::NAMESPACE::Descriptor **>(desc_ptr), \
+            y_desc,                                                          \
+            {x_desc})
+
+    switch (handle->device) {
+
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+__C __export infiniStatus_t infiniopGetErfcWorkspaceSize(infiniopErfcDescriptor_t desc, size_t *size) {
+
+#define GET(CASE, NAMESPACE)                                                                 \
+    case CASE:                                                                               \
+        *size = reinterpret_cast<op::erfc::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu)
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia)
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax)
+#endif
+#ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore)
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef GET
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C __export infiniStatus_t infiniopErfc(
+    infiniopErfcDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                              \
+    case CASE:                                                                  \
+        return reinterpret_cast<const op::erfc::NAMESPACE::Descriptor *>(desc)  \
+            ->calculate(workspace, workspace_size, y, {x}, stream)
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+__C __export infiniStatus_t
+infiniopDestroyErfcDescriptor(infiniopErfcDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                                  \
+    case CASE:                                                                   \
+        delete reinterpret_cast<const op::erfc::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DELETE
+}
diff --git a/src/infiniop/ops/erfinv/cpu/erfinv_cpu.cc b/src/infiniop/ops/erfinv/cpu/erfinv_cpu.cc
new file mode 100644
index 000000000..16c5c8cba
--- /dev/null
+++ b/src/infiniop/ops/erfinv/cpu/erfinv_cpu.cc
@@ -0,0 +1,52 @@
+#include "erfinv_cpu.h"
+
+namespace op::erfinv::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<ErfinvOp, bf16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<ErfinvOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<ErfinvOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<ErfinvOp, double>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::erfinv::cpu
diff --git a/src/infiniop/ops/erfinv/cpu/erfinv_cpu.h b/src/infiniop/ops/erfinv/cpu/erfinv_cpu.h
new file mode 100644
index 000000000..8cc218a1d
--- /dev/null
+++ b/src/infiniop/ops/erfinv/cpu/erfinv_cpu.h
@@ -0,0 +1,46 @@
+#ifndef __ERFINV_CPU_H__
+#define __ERFINV_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+#include <cmath>
+#include <limits>
+
+ELEMENTWISE_DESCRIPTOR(erfinv, cpu)
+
+namespace op::erfinv::cpu {
+
+// Inverse error function implementation using Newton's method
+template <typename T>
+T erfinv_impl(T x) {
+    // Domain: x in (-1, 1)
+    if (x == 1.0) return std::numeric_limits<T>::infinity();
+    if (x == -1.0) return -std::numeric_limits<T>::infinity();
+    if (x > 1.0 || x < -1.0) return std::numeric_limits<T>::quiet_NaN();
+    if (x == 0.0) return 0.0;
+
+    // Use Newton's method to solve erf(y) = x
+    T y = x; // Initial guess
+    const int max_iter = 10;
+    const T tol = static_cast<T>(1e-10);
+
+    for (int i = 0; i < max_iter; ++i) {
+        T erf_y = std::erf(y);
+        T derf_dy = 2.0 / std::sqrt(3.14159265358979323846) * std::exp(-y * y);
+        T error = erf_y - x;
+        if (std::abs(error) < tol) break;
+        y = y - error / derf_dy;
+    }
+    return y;
+}
+
+typedef struct ErfinvOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    T operator()(const T &x) const {
+        return erfinv_impl(x);
+    }
+} ErfinvOp;
+} // namespace op::erfinv::cpu
+
+#endif // __ERFINV_CPU_H__
diff --git a/src/infiniop/ops/erfinv/cuda/kernel.cuh b/src/infiniop/ops/erfinv/cuda/kernel.cuh
new file mode 100644
index 000000000..4f4660e80
--- /dev/null
+++ b/src/infiniop/ops/erfinv/cuda/kernel.cuh
@@ -0,0 +1,113 @@
+#pragma once
+#include <cmath>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <math_constants.h>
+#include <type_traits>
+
+namespace op::cuda {
+
+// Inverse error function.
+//
+// We use a Winitzki-style approximation for an initial guess, then refine with
+// a few Newton iterations. Starting with y=x converges poorly for x close to 1,
+// which appears frequently in test inputs (torch.rand in [0,1)).
+__device__ __forceinline__ float erfinv_impl(float x) {
+    if (x == 1.0f) return CUDART_INF_F;
+    if (x == -1.0f) return -CUDART_INF_F;
+    if (x > 1.0f || x < -1.0f) return CUDART_NAN_F;
+    if (x == 0.0f) return 0.0f;
+
+    // Winitzki approximation (a = 0.147) for initial guess.
+    // See: https://arxiv.org/abs/math/0306301 (and common implementations).
+    const float a = 0.147f;
+    const float ln = log1pf(-x * x); // ln(1 - x^2) <= 0
+    const float t = 2.0f / (CUDART_PI_F * a) + ln * 0.5f;
+    float inside = t * t - ln / a;
+    inside = inside > 0.0f ? inside : 0.0f;
+    float y0 = copysignf(sqrtf(sqrtf(inside) - t), x);
+
+    // Fast path: a few Newton steps in float.
+    // This is sufficient for most x and much faster than always refining in double.
+    float y = y0;
+    const float sqrt_pi_f = 1.7724538509055159f; // sqrt(pi)
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {
+        const float erf_y = erff(y);
+        const float derf_dy = 2.0f / sqrt_pi_f * expf(-y * y);
+        y = y - (erf_y - x) / derf_dy;
+    }
+
+    // Hybrid slow path: only for values extremely close to ±1 where float erf
+    // quantization can cause Newton iterations to stagnate, leading to noticeable
+    // absolute error in y (even if erff(y) == x in float).
+    //
+    // The threshold is chosen so the slow path is taken very rarely for typical
+    // random inputs, minimizing warp divergence and preserving performance.
+    const float ax = fabsf(x);
+    if (1.0f - ax < 1e-4f) {
+        const double xd = static_cast<double>(x);
+        double yd = static_cast<double>(y);
+        const double sqrt_pi = 1.7724538509055159; // sqrt(pi)
+#pragma unroll
+        for (int i = 0; i < 4; ++i) {
+            const double erf_y = erf(yd);
+            const double derf_dy = 2.0 / sqrt_pi * exp(-yd * yd);
+            yd = yd - (erf_y - xd) / derf_dy;
+        }
+        y = static_cast<float>(yd);
+    }
+
+    return y;
+}
+
+struct ErfinvOp {
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(T x) const {
+        if constexpr (std::is_same_v<T, float>) {
+            return erfinv_impl(x);
+        } else if constexpr (std::is_same_v<T, double>) {
+            // For double, use similar approach
+            if (x == 1.0) return CUDART_INF;
+            if (x == -1.0) return -CUDART_INF;
+            if (x > 1.0 || x < -1.0) return CUDART_NAN;
+            if (x == 0.0) return 0.0;
+            const double a = 0.147;
+            const double ln = log1p(-x * x);
+            const double t = 2.0 / (CUDART_PI * a) + ln * 0.5;
+            double inside = t * t - ln / a;
+            inside = inside > 0.0 ? inside : 0.0;
+            double y = copysign(sqrt(sqrt(inside) - t), x);
+
+            const int max_iter = 30;
+            const double tol = 1e-14;
+            const double sqrt_pi = 1.7724538509055159;
+            for (int i = 0; i < max_iter; ++i) {
+                const double erf_y = erf(y);
+                const double error = erf_y - x;
+                if (fabs(error) < tol) break;
+                const double derf_dy = 2.0 / sqrt_pi * exp(-y * y);
+                y = y - error / derf_dy;
+            }
+            return y;
+        } else {
+            // For F16/BF16: promote to float, compute, then cast back
+            float xf;
+            if constexpr (std::is_same_v<T, half>) {
+                xf = __half2float(x);
+                return __float2half_rn(erfinv_impl(xf));
+            } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+                xf = __bfloat162float(x);
+                return __float2bfloat16_rn(erfinv_impl(xf));
+            } else {
+                xf = static_cast<float>(x);
+                return static_cast<T>(erfinv_impl(xf));
+            }
+        }
+    }
+};
+
+} // namespace op::cuda
diff --git a/src/infiniop/ops/erfinv/erfinv.h b/src/infiniop/ops/erfinv/erfinv.h
new file mode 100644
index 000000000..f3ed9350f
--- /dev/null
+++ b/src/infiniop/ops/erfinv/erfinv.h
@@ -0,0 +1,8 @@
+#ifndef __ERFINV_H__
+#define __ERFINV_H__
+
+#include "../../elementwise/elementwise.h"
+
+#define DESCRIPTOR(NAMESPACE) ELEMENTWISE_DESCRIPTOR(erfinv, NAMESPACE)
+
+#endif // __ERFINV_H__
diff --git a/src/infiniop/ops/erfinv/metax/erfinv_metax.h b/src/infiniop/ops/erfinv/metax/erfinv_metax.h
new file mode 100644
index 000000000..05058bfc6
--- /dev/null
+++ b/src/infiniop/ops/erfinv/metax/erfinv_metax.h
@@ -0,0 +1,8 @@
+#ifndef __ERFINV_METAX_API_H__
+#define __ERFINV_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(erfinv, metax)
+
+#endif // __ERFINV_METAX_API_H__
diff --git a/src/infiniop/ops/erfinv/metax/erfinv_metax.maca b/src/infiniop/ops/erfinv/metax/erfinv_metax.maca
new file mode 100644
index 000000000..1e9144074
--- /dev/null
+++ b/src/infiniop/ops/erfinv/metax/erfinv_metax.maca
@@ -0,0 +1,60 @@
+#include "erfinv_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::erfinv::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::ErfinvOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::ErfinvOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::ErfinvOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::ErfinvOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::erfinv::metax
diff --git a/src/infiniop/ops/erfinv/moore/erfinv_moore.h b/src/infiniop/ops/erfinv/moore/erfinv_moore.h
new file mode 100644
index 000000000..9eed18024
--- /dev/null
+++ b/src/infiniop/ops/erfinv/moore/erfinv_moore.h
@@ -0,0 +1,8 @@
+#ifndef __ERFINV_MOORE_API_H__
+#define __ERFINV_MOORE_API_H__
+
+#include "../../../elementwise/moore/elementwise_moore_api.h"
+
+ELEMENTWISE_DESCRIPTOR(erfinv, moore)
+
+#endif // __ERFINV_MOORE_API_H__
diff --git a/src/infiniop/ops/erfinv/moore/erfinv_moore.mu b/src/infiniop/ops/erfinv/moore/erfinv_moore.mu
new file mode 100644
index 000000000..54b5830ea
--- /dev/null
+++ b/src/infiniop/ops/erfinv/moore/erfinv_moore.mu
@@ -0,0 +1,60 @@
+#include "erfinv_moore.h"
+
+#include "../../../elementwise/moore/elementwise_moore.h"
+
+#include "erfinv_moore_kernel.h"
+
+namespace op::erfinv::moore {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, moore::ErfinvOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, moore::ErfinvOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, moore::ErfinvOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, moore::ErfinvOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::erfinv::moore
diff --git a/src/infiniop/ops/erfinv/moore/erfinv_moore_kernel.h b/src/infiniop/ops/erfinv/moore/erfinv_moore_kernel.h
new file mode 100644
index 000000000..e3f3bb5f2
--- /dev/null
+++ b/src/infiniop/ops/erfinv/moore/erfinv_moore_kernel.h
@@ -0,0 +1,72 @@
+#ifndef __ERFINV_MOORE_KERNEL_H__
+#define __ERFINV_MOORE_KERNEL_H__
+
+#include <cmath>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <type_traits>
+
+namespace op::erfinv::moore {
+
+// Inverse error function using Newton's method
+template <typename T>
+__device__ __forceinline__ T erfinv_impl(T x) {
+    if (x >= 1.0f) return CUDART_INF_F;
+    if (x <= -1.0f) return -CUDART_INF_F;
+    if (x == 0.0f) return 0.0f;
+
+    T y = x;
+    const int max_iter = 10;
+    const T tol = 1e-10f;
+    const T sqrt_pi = 1.7724538509055159f;
+
+    for (int i = 0; i < max_iter; ++i) {
+        T erf_y = erff(y);
+        T derf_dy = 2.0f / sqrt_pi * expf(-y * y);
+        T error = erf_y - x;
+        if (fabsf(error) < tol) break;
+        y = y - error / derf_dy;
+    }
+    return y;
+}
+
+typedef struct ErfinvOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            float x0 = __low2float(x);
+            float x1 = __high2float(x);
+            return __floats2half2_rn(erfinv_impl(x0), erfinv_impl(x1));
+        } else if constexpr (std::is_same_v<T, half>) {
+            float xf = __half2float(x);
+            return __float2half(erfinv_impl(xf));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            float xf = __bfloat162float(x);
+            return __float2bfloat16_rn(erfinv_impl(xf));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return erfinv_impl(x);
+        } else { // double
+            if (x >= 1.0) return CUDART_INF;
+            if (x <= -1.0) return -CUDART_INF;
+            if (x == 0.0) return 0.0;
+            double y = x;
+            const int max_iter = 10;
+            const double tol = 1e-10;
+            const double sqrt_pi = 1.7724538509055159;
+            for (int i = 0; i < max_iter; ++i) {
+                double erf_y = erf(y);
+                double derf_dy = 2.0 / sqrt_pi * exp(-y * y);
+                double error = erf_y - x;
+                if (fabs(error) < tol) break;
+                y = y - error / derf_dy;
+            }
+            return y;
+        }
+    }
+} ErfinvOp;
+
+} // namespace op::erfinv::moore
+
+#endif // __ERFINV_MOORE_KERNEL_H__
diff --git a/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cu b/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cu
new file mode 100644
index 000000000..35f5d3fe2
--- /dev/null
+++ b/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cu
@@ -0,0 +1,58 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "erfinv_nvidia.cuh"
+
+namespace op::erfinv::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::ErfinvOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::ErfinvOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::ErfinvOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::ErfinvOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::erfinv::nvidia
diff --git a/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cuh b/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cuh
new file mode 100644
index 000000000..af80be12f
--- /dev/null
+++ b/src/infiniop/ops/erfinv/nvidia/erfinv_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __ERFINV_NVIDIA_H__
+#define __ERFINV_NVIDIA_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(erfinv, nvidia)
+
+#endif // __ERFINV_NVIDIA_H__
diff --git a/src/infiniop/ops/erfinv/operator.cc b/src/infiniop/ops/erfinv/operator.cc
new file mode 100644
index 000000000..c7c360bec
--- /dev/null
+++ b/src/infiniop/ops/erfinv/operator.cc
@@ -0,0 +1,157 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/erfinv.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/erfinv_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
+#include "nvidia/erfinv_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/erfinv_metax.h"
+#endif
+#ifdef ENABLE_MOORE_API
+#include "moore/erfinv_moore.h"
+#endif
+
+__C __export infiniStatus_t infiniopCreateErfinvDescriptor(
+    infiniopHandle_t handle,
+    infiniopErfinvDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc) {
+
+#define CREATE(CASE, NAMESPACE)                                               \
+    case CASE:                                                                \
+        return op::erfinv::NAMESPACE::Descriptor::create(                    \
+            handle,                                                           \
+            reinterpret_cast<op::erfinv::NAMESPACE::Descriptor **>(desc_ptr), \
+            y_desc,                                                           \
+            {x_desc})
+
+    switch (handle->device) {
+
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+__C __export infiniStatus_t infiniopGetErfinvWorkspaceSize(infiniopErfinvDescriptor_t desc, size_t *size) {
+
+#define GET(CASE, NAMESPACE)                                                                   \
+    case CASE:                                                                                 \
+        *size = reinterpret_cast<op::erfinv::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu)
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia)
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax)
+#endif
+#ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore)
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef GET
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C __export infiniStatus_t infiniopErfinv(
+    infiniopErfinvDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                               \
+    case CASE:                                                                   \
+        return reinterpret_cast<const op::erfinv::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, y, {x}, stream)
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+__C __export infiniStatus_t
+infiniopDestroyErfinvDescriptor(infiniopErfinvDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                                   \
+    case CASE:                                                                    \
+        delete reinterpret_cast<const op::erfinv::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DELETE
+}
diff --git a/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.cc b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.cc
new file mode 100644
index 000000000..22a792012
--- /dev/null
+++ b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.cc
@@ -0,0 +1,201 @@
+#include "matrix_power_cpu.h"
+#include "../../../utils.h"
+#include <cstring>
+#include <algorithm>
+
+namespace op::matrix_power::cpu {
+
+utils::Result<MatrixPowerInfo> MatrixPowerInfo::create(
+    infiniopTensorDescriptor_t x_desc,
+    infiniopTensorDescriptor_t y_desc,
+    int n) {
+
+    auto x_shape = x_desc->shape();
+    auto y_shape = y_desc->shape();
+
+    if (x_shape.size() != 2 || x_shape[0] != x_shape[1]) {
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    if (y_shape != x_shape) {
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    if (n < 0) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+
+    MatrixPowerInfo info;
+    info.matrix_size = x_shape[0];
+    info.n = static_cast<size_t>(n);
+    info.input_size = x_desc->numel();
+    info.output_size = y_desc->numel();
+
+    return utils::Result<MatrixPowerInfo>(std::move(info));
+}
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    int n) {
+
+    auto dtype = x_desc->dtype();
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+    CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
+
+    CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES);
+    CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES);
+
+    auto info_result = MatrixPowerInfo::create(x_desc, y_desc, n);
+    CHECK_RESULT(info_result);
+
+    *desc_ptr = new Descriptor(dtype, info_result.take(), handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <typename T>
+void matrix_power_impl(
+    const MatrixPowerInfo &info,
+    T *y,
+    const T *x,
+    void *workspace) {
+
+    size_t n = info.matrix_size;
+    int power = static_cast<int>(info.n);
+
+    // Use workspace for temporary matrices
+    T *temp1 = reinterpret_cast<T *>(workspace);
+    T *temp2 = temp1 + n * n;
+
+    // Initialize result as identity matrix
+    std::memset(y, 0, n * n * sizeof(T));
+    for (size_t i = 0; i < n; ++i) {
+        y[i * n + i] = utils::cast<T>(1.0);
+    }
+
+    // Copy input to temp1
+    std::memcpy(temp1, x, n * n * sizeof(T));
+
+    // Binary exponentiation
+    while (power > 0) {
+        if (power & 1) {
+            // Multiply result by temp1
+            std::memset(temp2, 0, n * n * sizeof(T));
+            for (size_t i = 0; i < n; ++i) {
+                for (size_t k = 0; k < n; ++k) {
+                    T val = y[i * n + k];
+                    for (size_t j = 0; j < n; ++j) {
+                        temp2[i * n + j] += val * temp1[k * n + j];
+                    }
+                }
+            }
+            std::memcpy(y, temp2, n * n * sizeof(T));
+        }
+        // Square temp1
+        std::memset(temp2, 0, n * n * sizeof(T));
+        for (size_t i = 0; i < n; ++i) {
+            for (size_t k = 0; k < n; ++k) {
+                T val = temp1[i * n + k];
+                for (size_t j = 0; j < n; ++j) {
+                    temp2[i * n + j] += val * temp1[k * n + j];
+                }
+            }
+        }
+        std::memcpy(temp1, temp2, n * n * sizeof(T));
+        power >>= 1;
+    }
+}
+
+template <typename T>
+void write_identity_impl(size_t n, T *y) {
+    std::fill(y, y + n * n, utils::cast<T>(0.0));
+    for (size_t i = 0; i < n; ++i) {
+        y[i * n + i] = utils::cast<T>(1.0);
+    }
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) const {
+
+    if (_info.matrix_size == 0) {
+        return INFINI_STATUS_SUCCESS;
+    }
+    if (_info.n == 0) {
+        const size_t n = _info.matrix_size;
+        switch (_dtype) {
+        case INFINI_DTYPE_F16:
+            write_identity_impl<fp16_t>(n, reinterpret_cast<fp16_t *>(y));
+            return INFINI_STATUS_SUCCESS;
+        case INFINI_DTYPE_BF16:
+            write_identity_impl<bf16_t>(n, reinterpret_cast<bf16_t *>(y));
+            return INFINI_STATUS_SUCCESS;
+        case INFINI_DTYPE_F32:
+            write_identity_impl<float>(n, reinterpret_cast<float *>(y));
+            return INFINI_STATUS_SUCCESS;
+        case INFINI_DTYPE_F64:
+            write_identity_impl<double>(n, reinterpret_cast<double *>(y));
+            return INFINI_STATUS_SUCCESS;
+        default:
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+    }
+
+    if (workspace_size < this->workspaceSize()) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+    if (this->workspaceSize() > 0 && workspace == nullptr) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16: {
+        // Convert to float for computation
+        std::vector<float> x_f(_info.input_size);
+        std::vector<float> y_f(_info.output_size);
+        float *workspace_f = reinterpret_cast<float *>(workspace);
+        for (size_t i = 0; i < _info.input_size; ++i) {
+            x_f[i] = utils::cast<float>(reinterpret_cast<const fp16_t *>(x)[i]);
+        }
+        MatrixPowerInfo info_f = _info;
+        matrix_power_impl<float>(info_f, y_f.data(), x_f.data(), workspace_f);
+        for (size_t i = 0; i < _info.output_size; ++i) {
+            reinterpret_cast<fp16_t *>(y)[i] = utils::cast<fp16_t>(y_f[i]);
+        }
+        break;
+    }
+    case INFINI_DTYPE_BF16: {
+        std::vector<float> x_f(_info.input_size);
+        std::vector<float> y_f(_info.output_size);
+        float *workspace_f = reinterpret_cast<float *>(workspace);
+        for (size_t i = 0; i < _info.input_size; ++i) {
+            x_f[i] = utils::cast<float>(reinterpret_cast<const bf16_t *>(x)[i]);
+        }
+        MatrixPowerInfo info_f = _info;
+        matrix_power_impl<float>(info_f, y_f.data(), x_f.data(), workspace_f);
+        for (size_t i = 0; i < _info.output_size; ++i) {
+            reinterpret_cast<bf16_t *>(y)[i] = utils::cast<bf16_t>(y_f[i]);
+        }
+        break;
+    }
+    case INFINI_DTYPE_F32:
+        matrix_power_impl<float>(_info, reinterpret_cast<float *>(y), reinterpret_cast<const float *>(x), workspace);
+        break;
+    case INFINI_DTYPE_F64:
+        matrix_power_impl<double>(_info, reinterpret_cast<double *>(y), reinterpret_cast<const double *>(x), workspace);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::matrix_power::cpu
diff --git a/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.h b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.h
new file mode 100644
index 000000000..9c6f2ebc1
--- /dev/null
+++ b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.h
@@ -0,0 +1,69 @@
+#ifndef __MATRIX_POWER_CPU_H__
+#define __MATRIX_POWER_CPU_H__
+
+#include "../../../operator.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include <vector>
+
+namespace op::matrix_power::cpu {
+
+struct MatrixPowerInfo {
+    size_t matrix_size;  // N x N matrix
+    size_t n;            // Power
+    size_t input_size;
+    size_t output_size;
+
+    static utils::Result<MatrixPowerInfo> create(
+        infiniopTensorDescriptor_t x_desc,
+        infiniopTensorDescriptor_t y_desc,
+        int n);
+};
+
+class Descriptor final : public InfiniopDescriptor {
+    infiniDtype_t _dtype;
+    MatrixPowerInfo _info;
+
+    Descriptor(infiniDtype_t dtype, MatrixPowerInfo info,
+               infiniDevice_t device_type, int device_id)
+        : InfiniopDescriptor{device_type, device_id},
+          _dtype(dtype),
+          _info(std::move(info)) {}
+
+public:
+    ~Descriptor();
+
+    static infiniStatus_t create(
+        infiniopHandle_t handle,
+        Descriptor **desc_ptr,
+        infiniopTensorDescriptor_t y_desc,
+        infiniopTensorDescriptor_t x_desc,
+        int n);
+
+    size_t workspaceSize() const {
+        if (_info.n == 0 || _info.matrix_size == 0) {
+            return 0;
+        }
+        const size_t elems = 2 * _info.matrix_size * _info.matrix_size;
+        switch (_dtype) {
+        case INFINI_DTYPE_F16:
+        case INFINI_DTYPE_BF16:
+        case INFINI_DTYPE_F32:
+            return elems * sizeof(float);
+        case INFINI_DTYPE_F64:
+            return elems * sizeof(double);
+        default:
+            return 0;
+        }
+    }
+
+    infiniStatus_t calculate(
+        void *workspace,
+        size_t workspace_size,
+        void *y,
+        const void *x,
+        void *stream) const;
+};
+
+} // namespace op::matrix_power::cpu
+
+#endif // __MATRIX_POWER_CPU_H__
diff --git a/src/infiniop/ops/matrix_power/metax/matrix_power_metax.h b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.h
new file mode 100644
index 000000000..02ff4b14d
--- /dev/null
+++ b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.h
@@ -0,0 +1,48 @@
+#ifndef __MATRIX_POWER_METAX_H__
+#define __MATRIX_POWER_METAX_H__
+
+#include "../../../operator.h"
+#include "../../../devices/metax/metax_common.h"
+
+namespace op::matrix_power::metax {
+
+class Descriptor final : public InfiniopDescriptor {
+    infiniDtype_t _dtype;
+    size_t matrix_size;
+    size_t n;
+    size_t input_size;
+    size_t output_size;
+
+    Descriptor(infiniDtype_t dtype, size_t matrix_size, size_t n,
+               size_t input_size, size_t output_size,
+               infiniDevice_t device_type, int device_id)
+        : InfiniopDescriptor{device_type, device_id},
+          _dtype(dtype),
+          matrix_size(matrix_size),
+          n(n),
+          input_size(input_size),
+          output_size(output_size) {}
+
+public:
+    ~Descriptor();
+
+    static infiniStatus_t create(
+        infiniopHandle_t handle,
+        Descriptor **desc_ptr,
+        infiniopTensorDescriptor_t y_desc,
+        infiniopTensorDescriptor_t x_desc,
+        int n);
+
+    size_t workspaceSize() const { return 0; }
+
+    infiniStatus_t calculate(
+        void *workspace,
+        size_t workspace_size,
+        void *y,
+        const void *x,
+        void *stream) const;
+};
+
+} // namespace op::matrix_power::metax
+
+#endif // __MATRIX_POWER_METAX_H__
diff --git a/src/infiniop/ops/matrix_power/metax/matrix_power_metax.maca b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.maca
new file mode 100644
index 000000000..0e7f58f19
--- /dev/null
+++ b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.maca
@@ -0,0 +1,107 @@
+#include "matrix_power_metax.h"
+#include "../../../utils.h"
+#include "../../../devices/metax/metax_kernel_common.h"
+#include <cuda_runtime.h>
+#include <vector>
+#include <cstring>
+
+namespace op::matrix_power::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    int n) {
+
+    auto dtype = x_desc->dtype();
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F32);
+    CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
+
+    auto x_shape = x_desc->shape();
+    auto y_shape = y_desc->shape();
+
+    if (x_shape.size() != 2 || x_shape[0] != x_shape[1]) {
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    if (y_shape != x_shape) {
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    CHECK_OR_RETURN(n >= 0, INFINI_STATUS_BAD_PARAM);
+
+    CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES);
+    CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES);
+
+    *desc_ptr = new Descriptor(dtype, x_shape[0], static_cast<size_t>(n),
+                               x_desc->numel(), y_desc->numel(),
+                               handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) const {
+
+    if (workspace_size < this->workspaceSize()) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    auto hc_stream = reinterpret_cast<hcStream_t>(stream);
+    CHECK_OR_RETURN(_dtype == INFINI_DTYPE_F32, INFINI_STATUS_BAD_TENSOR_DTYPE);
+    size_t input_bytes = input_size * sizeof(float);
+
+    // Use CPU fallback for now
+    std::vector<float> h_matrix(input_size);
+    CHECK_METAX(hcMemcpyAsync(h_matrix.data(), x, input_bytes, hcMemcpyDeviceToHost, hc_stream));
+    CHECK_METAX(hcStreamSynchronize(hc_stream));
+
+    std::vector<float> result(output_size, 0.0f);
+    std::vector<float> temp1(input_size);
+    std::vector<float> temp2(input_size);
+    std::memcpy(temp1.data(), h_matrix.data(), input_bytes);
+
+    for (size_t i = 0; i < matrix_size; ++i) {
+        result[i * matrix_size + i] = 1.0f;
+    }
+
+    int power = static_cast<int>(n);
+    while (power > 0) {
+        if (power & 1) {
+            std::fill(temp2.begin(), temp2.end(), 0.0f);
+            for (size_t i = 0; i < matrix_size; ++i) {
+                for (size_t k = 0; k < matrix_size; ++k) {
+                    float val = result[i * matrix_size + k];
+                    for (size_t j = 0; j < matrix_size; ++j) {
+                        temp2[i * matrix_size + j] += val * temp1[k * matrix_size + j];
+                    }
+                }
+            }
+            std::memcpy(result.data(), temp2.data(), output_size * sizeof(float));
+        }
+        std::fill(temp2.begin(), temp2.end(), 0.0f);
+        for (size_t i = 0; i < matrix_size; ++i) {
+            for (size_t k = 0; k < matrix_size; ++k) {
+                float val = temp1[i * matrix_size + k];
+                for (size_t j = 0; j < matrix_size; ++j) {
+                    temp2[i * matrix_size + j] += val * temp1[k * matrix_size + j];
+                }
+            }
+        }
+        std::memcpy(temp1.data(), temp2.data(), input_bytes);
+        power >>= 1;
+    }
+
+    CHECK_METAX(hcMemcpyAsync(y, result.data(), input_bytes, hcMemcpyHostToDevice, hc_stream));
+    CHECK_METAX(hcStreamSynchronize(hc_stream));
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::matrix_power::metax
diff --git a/src/infiniop/ops/matrix_power/moore/matrix_power_moore.h b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.h
new file mode 100644
index 000000000..a58428a6e
--- /dev/null
+++ b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.h
@@ -0,0 +1,48 @@
+#ifndef __MATRIX_POWER_MOORE_H__
+#define __MATRIX_POWER_MOORE_H__
+
+#include "../../../operator.h"
+#include "../../../devices/moore/moore_common.h"
+
+namespace op::matrix_power::moore {
+
+class Descriptor final : public InfiniopDescriptor {
+    infiniDtype_t _dtype;
+    size_t matrix_size;
+    size_t n;
+    size_t input_size;
+    size_t output_size;
+
+    Descriptor(infiniDtype_t dtype, size_t matrix_size, size_t n,
+               size_t input_size, size_t output_size,
+               infiniDevice_t device_type, int device_id)
+        : InfiniopDescriptor{device_type, device_id},
+          _dtype(dtype),
+          matrix_size(matrix_size),
+          n(n),
+          input_size(input_size),
+          output_size(output_size) {}
+
+public:
+    ~Descriptor();
+
+    static infiniStatus_t create(
+        infiniopHandle_t handle,
+        Descriptor **desc_ptr,
+        infiniopTensorDescriptor_t y_desc,
+        infiniopTensorDescriptor_t x_desc,
+        int n);
+
+    size_t workspaceSize() const { return 0; }
+
+    infiniStatus_t calculate(
+        void *workspace,
+        size_t workspace_size,
+        void *y,
+        const void *x,
+        void *stream) const;
+};
+
+} // namespace op::matrix_power::moore
+
+#endif // __MATRIX_POWER_MOORE_H__
diff --git a/src/infiniop/ops/matrix_power/moore/matrix_power_moore.mu b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.mu
new file mode 100644
index 000000000..532480955
--- /dev/null
+++ b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.mu
@@ -0,0 +1,106 @@
+#include "matrix_power_moore.h"
+#include "../../../utils.h"
+#include "../../../devices/moore/moore_kernel_common.h"
+#include <cuda_runtime.h>
+#include <vector>
+#include <cstring>
+
+namespace op::matrix_power::moore {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    int n) {
+
+    auto dtype = x_desc->dtype();
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F32);
+    CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
+
+    auto x_shape = x_desc->shape();
+    auto y_shape = y_desc->shape();
+
+    if (x_shape.size() != 2 || x_shape[0] != x_shape[1]) {
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    if (y_shape != x_shape) {
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    CHECK_OR_RETURN(n >= 0, INFINI_STATUS_BAD_PARAM);
+
+    CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES);
+    CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES);
+
+    *desc_ptr = new Descriptor(dtype, x_shape[0], static_cast<size_t>(n),
+                               x_desc->numel(), y_desc->numel(),
+                               handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) const {
+
+    if (workspace_size < this->workspaceSize()) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
+    CHECK_OR_RETURN(_dtype == INFINI_DTYPE_F32, INFINI_STATUS_BAD_TENSOR_DTYPE);
+    size_t input_bytes = input_size * sizeof(float);
+
+    std::vector<float> h_matrix(input_size);
+    CHECK_MOORE(musaMemcpyAsync(h_matrix.data(), x, input_bytes, musaMemcpyDeviceToHost, musa_stream));
+    CHECK_MOORE(musaStreamSynchronize(musa_stream));
+
+    std::vector<float> result(output_size, 0.0f);
+    std::vector<float> temp1(input_size);
+    std::vector<float> temp2(input_size);
+    std::memcpy(temp1.data(), h_matrix.data(), input_bytes);
+
+    for (size_t i = 0; i < matrix_size; ++i) {
+        result[i * matrix_size + i] = 1.0f;
+    }
+
+    int power = static_cast<int>(n);
+    while (power > 0) {
+        if (power & 1) {
+            std::fill(temp2.begin(), temp2.end(), 0.0f);
+            for (size_t i = 0; i < matrix_size; ++i) {
+                for (size_t k = 0; k < matrix_size; ++k) {
+                    float val = result[i * matrix_size + k];
+                    for (size_t j = 0; j < matrix_size; ++j) {
+                        temp2[i * matrix_size + j] += val * temp1[k * matrix_size + j];
+                    }
+                }
+            }
+            std::memcpy(result.data(), temp2.data(), output_size * sizeof(float));
+        }
+        std::fill(temp2.begin(), temp2.end(), 0.0f);
+        for (size_t i = 0; i < matrix_size; ++i) {
+            for (size_t k = 0; k < matrix_size; ++k) {
+                float val = temp1[i * matrix_size + k];
+                for (size_t j = 0; j < matrix_size; ++j) {
+                    temp2[i * matrix_size + j] += val * temp1[k * matrix_size + j];
+                }
+            }
+        }
+        std::memcpy(temp1.data(), temp2.data(), input_bytes);
+        power >>= 1;
+    }
+
+    CHECK_MOORE(musaMemcpyAsync(y, result.data(), input_bytes, musaMemcpyHostToDevice, musa_stream));
+    CHECK_MOORE(musaStreamSynchronize(musa_stream));
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::matrix_power::moore
diff --git a/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cu b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cu
new file mode 100644
index 000000000..e6164c8ca
--- /dev/null
+++ b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cu
@@ -0,0 +1,548 @@
+#include "matrix_power_nvidia.cuh"
+#include "../../../utils.h"
+#include "../../../devices/nvidia/nvidia_handle.cuh"
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <limits>
+#include <utility>
+
+namespace op::matrix_power::nvidia {
+
+namespace {
+
+template <typename T>
+__forceinline__ __device__ T identityZero();
+
+template <typename T>
+__forceinline__ __device__ T identityOne();
+
+template <>
+__forceinline__ __device__ __half identityZero<__half>() {
+    return __float2half(0.0f);
+}
+
+template <>
+__forceinline__ __device__ __half identityOne<__half>() {
+    return __float2half(1.0f);
+}
+
+template <>
+__forceinline__ __device__ cuda_bfloat16 identityZero<cuda_bfloat16>() {
+    return __float2bfloat16(0.0f);
+}
+
+template <>
+__forceinline__ __device__ cuda_bfloat16 identityOne<cuda_bfloat16>() {
+    return __float2bfloat16(1.0f);
+}
+
+template <>
+__forceinline__ __device__ float identityZero<float>() {
+    return 0.0f;
+}
+
+template <>
+__forceinline__ __device__ float identityOne<float>() {
+    return 1.0f;
+}
+
+template <>
+__forceinline__ __device__ double identityZero<double>() {
+    return 0.0;
+}
+
+template <>
+__forceinline__ __device__ double identityOne<double>() {
+    return 1.0;
+}
+
+template <typename T>
+INFINIOP_CUDA_KERNEL packMatrix2dStridedToContiguous(
+    const T *src,
+    T *dst,
+    size_t matrix_size,
+    ptrdiff_t src_stride_0,
+    ptrdiff_t src_stride_1) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t matrix_numel = matrix_size * matrix_size;
+    if (idx < matrix_numel) {
+        size_t row = idx / matrix_size;
+        size_t col = idx - row * matrix_size;
+        dst[idx] = src[row * src_stride_0 + col * src_stride_1];
+    }
+}
+
+template <typename T>
+INFINIOP_CUDA_KERNEL scatterMatrix2dContiguousToStrided(
+    const T *src,
+    T *dst,
+    size_t matrix_size,
+    ptrdiff_t dst_stride_0,
+    ptrdiff_t dst_stride_1) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t matrix_numel = matrix_size * matrix_size;
+    if (idx < matrix_numel) {
+        size_t row = idx / matrix_size;
+        size_t col = idx - row * matrix_size;
+        dst[row * dst_stride_0 + col * dst_stride_1] = src[idx];
+    }
+}
+
+template <typename T>
+INFINIOP_CUDA_KERNEL setIdentity2dStrided(
+    T *out,
+    size_t matrix_size,
+    ptrdiff_t out_stride_0,
+    ptrdiff_t out_stride_1) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t matrix_numel = matrix_size * matrix_size;
+    if (idx < matrix_numel) {
+        size_t row = idx / matrix_size;
+        size_t col = idx - row * matrix_size;
+        out[row * out_stride_0 + col * out_stride_1] = (row == col) ? identityOne<T>() : identityZero<T>();
+    }
+}
+
+INFINIOP_CUDA_KERNEL setDiagonalFp16(__half *out, size_t n) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < n) {
+        out[idx * n + idx] = __float2half(1.0f);
+    }
+}
+
+INFINIOP_CUDA_KERNEL setDiagonalBf16(cuda_bfloat16 *out, size_t n) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < n) {
+        out[idx * n + idx] = __float2bfloat16(1.0f);
+    }
+}
+
+INFINIOP_CUDA_KERNEL setDiagonalFp32(float *out, size_t n) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < n) {
+        out[idx * n + idx] = 1.0f;
+    }
+}
+
+INFINIOP_CUDA_KERNEL setDiagonalFp64(double *out, size_t n) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < n) {
+        out[idx * n + idx] = 1.0;
+    }
+}
+
+infiniStatus_t initializeIdentity(
+    void *y,
+    infiniDtype_t dtype,
+    size_t matrix_size,
+    bool y_contiguous,
+    ptrdiff_t y_stride_0,
+    ptrdiff_t y_stride_1,
+    cudaStream_t stream) {
+
+    if (matrix_size == 0) {
+        return INFINI_STATUS_SUCCESS;
+    }
+
+    constexpr int threads = 256;
+    size_t diag_blocks = CEIL_DIV(matrix_size, static_cast<size_t>(threads));
+    size_t matrix_numel = matrix_size * matrix_size;
+    size_t matrix_blocks = CEIL_DIV(matrix_numel, static_cast<size_t>(threads));
+
+    if (y_contiguous) {
+        CHECK_CUDA(cudaMemsetAsync(y, 0, matrix_numel * infiniSizeOf(dtype), stream));
+    }
+
+    switch (dtype) {
+    case INFINI_DTYPE_F16:
+        if (y_contiguous) {
+            setDiagonalFp16<<<static_cast<unsigned int>(diag_blocks), threads, 0, stream>>>(
+                reinterpret_cast<__half *>(y), matrix_size);
+        } else {
+            setIdentity2dStrided<<<static_cast<unsigned int>(matrix_blocks), threads, 0, stream>>>(
+                reinterpret_cast<__half *>(y), matrix_size, y_stride_0, y_stride_1);
+        }
+        break;
+    case INFINI_DTYPE_BF16:
+        if (y_contiguous) {
+            setDiagonalBf16<<<static_cast<unsigned int>(diag_blocks), threads, 0, stream>>>(
+                reinterpret_cast<cuda_bfloat16 *>(y), matrix_size);
+        } else {
+            setIdentity2dStrided<<<static_cast<unsigned int>(matrix_blocks), threads, 0, stream>>>(
+                reinterpret_cast<cuda_bfloat16 *>(y), matrix_size, y_stride_0, y_stride_1);
+        }
+        break;
+    case INFINI_DTYPE_F32:
+        if (y_contiguous) {
+            setDiagonalFp32<<<static_cast<unsigned int>(diag_blocks), threads, 0, stream>>>(
+                reinterpret_cast<float *>(y), matrix_size);
+        } else {
+            setIdentity2dStrided<<<static_cast<unsigned int>(matrix_blocks), threads, 0, stream>>>(
+                reinterpret_cast<float *>(y), matrix_size, y_stride_0, y_stride_1);
+        }
+        break;
+    case INFINI_DTYPE_F64:
+        if (y_contiguous) {
+            setDiagonalFp64<<<static_cast<unsigned int>(diag_blocks), threads, 0, stream>>>(
+                reinterpret_cast<double *>(y), matrix_size);
+        } else {
+            setIdentity2dStrided<<<static_cast<unsigned int>(matrix_blocks), threads, 0, stream>>>(
+                reinterpret_cast<double *>(y), matrix_size, y_stride_0, y_stride_1);
+        }
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    CHECK_CUDA(cudaGetLastError());
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t packInputToContiguous(
+    void *dst,
+    const void *src,
+    infiniDtype_t dtype,
+    size_t matrix_size,
+    ptrdiff_t src_stride_0,
+    ptrdiff_t src_stride_1,
+    cudaStream_t stream) {
+    constexpr int threads = 256;
+    size_t matrix_numel = matrix_size * matrix_size;
+    size_t blocks = CEIL_DIV(matrix_numel, static_cast<size_t>(threads));
+    switch (dtype) {
+    case INFINI_DTYPE_F16:
+        packMatrix2dStridedToContiguous<<<static_cast<unsigned int>(blocks), threads, 0, stream>>>(
+            reinterpret_cast<const __half *>(src), reinterpret_cast<__half *>(dst),
+            matrix_size, src_stride_0, src_stride_1);
+        break;
+    case INFINI_DTYPE_BF16:
+        packMatrix2dStridedToContiguous<<<static_cast<unsigned int>(blocks), threads, 0, stream>>>(
+            reinterpret_cast<const cuda_bfloat16 *>(src), reinterpret_cast<cuda_bfloat16 *>(dst),
+            matrix_size, src_stride_0, src_stride_1);
+        break;
+    case INFINI_DTYPE_F32:
+        packMatrix2dStridedToContiguous<<<static_cast<unsigned int>(blocks), threads, 0, stream>>>(
+            reinterpret_cast<const float *>(src), reinterpret_cast<float *>(dst),
+            matrix_size, src_stride_0, src_stride_1);
+        break;
+    case INFINI_DTYPE_F64:
+        packMatrix2dStridedToContiguous<<<static_cast<unsigned int>(blocks), threads, 0, stream>>>(
+            reinterpret_cast<const double *>(src), reinterpret_cast<double *>(dst),
+            matrix_size, src_stride_0, src_stride_1);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    CHECK_CUDA(cudaGetLastError());
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t scatterContiguousToOutput(
+    void *dst,
+    const void *src,
+    infiniDtype_t dtype,
+    size_t matrix_size,
+    ptrdiff_t dst_stride_0,
+    ptrdiff_t dst_stride_1,
+    cudaStream_t stream) {
+    constexpr int threads = 256;
+    size_t matrix_numel = matrix_size * matrix_size;
+    size_t blocks = CEIL_DIV(matrix_numel, static_cast<size_t>(threads));
+    switch (dtype) {
+    case INFINI_DTYPE_F16:
+        scatterMatrix2dContiguousToStrided<<<static_cast<unsigned int>(blocks), threads, 0, stream>>>(
+            reinterpret_cast<const __half *>(src), reinterpret_cast<__half *>(dst),
+            matrix_size, dst_stride_0, dst_stride_1);
+        break;
+    case INFINI_DTYPE_BF16:
+        scatterMatrix2dContiguousToStrided<<<static_cast<unsigned int>(blocks), threads, 0, stream>>>(
+            reinterpret_cast<const cuda_bfloat16 *>(src), reinterpret_cast<cuda_bfloat16 *>(dst),
+            matrix_size, dst_stride_0, dst_stride_1);
+        break;
+    case INFINI_DTYPE_F32:
+        scatterMatrix2dContiguousToStrided<<<static_cast<unsigned int>(blocks), threads, 0, stream>>>(
+            reinterpret_cast<const float *>(src), reinterpret_cast<float *>(dst),
+            matrix_size, dst_stride_0, dst_stride_1);
+        break;
+    case INFINI_DTYPE_F64:
+        scatterMatrix2dContiguousToStrided<<<static_cast<unsigned int>(blocks), threads, 0, stream>>>(
+            reinterpret_cast<const double *>(src), reinterpret_cast<double *>(dst),
+            matrix_size, dst_stride_0, dst_stride_1);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    CHECK_CUDA(cudaGetLastError());
+    return INFINI_STATUS_SUCCESS;
+}
+
+#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
+using GemmComputeType = cudaDataType;
+#else
+using GemmComputeType = cublasComputeType_t;
+#endif
+
+struct GemmTypeConfig {
+    cudaDataType io_type;
+    GemmComputeType compute_type;
+};
+
+infiniStatus_t getGemmTypeConfig(infiniDtype_t dtype, GemmTypeConfig &cfg) {
+    switch (dtype) {
+    case INFINI_DTYPE_F16:
+        cfg.io_type = CUDA_R_16F;
+#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
+        cfg.compute_type = CUDA_R_32F;
+#else
+        cfg.compute_type = CUBLAS_COMPUTE_32F;
+#endif
+        return INFINI_STATUS_SUCCESS;
+    case INFINI_DTYPE_BF16:
+        cfg.io_type = CUDA_R_16BF;
+#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
+        cfg.compute_type = CUDA_R_32F;
+#else
+        cfg.compute_type = CUBLAS_COMPUTE_32F;
+#endif
+        return INFINI_STATUS_SUCCESS;
+    case INFINI_DTYPE_F32:
+        cfg.io_type = CUDA_R_32F;
+#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
+        cfg.compute_type = CUDA_R_32F;
+#else
+        cfg.compute_type = CUBLAS_COMPUTE_32F;
+#endif
+        return INFINI_STATUS_SUCCESS;
+    case INFINI_DTYPE_F64:
+        cfg.io_type = CUDA_R_64F;
+#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
+        cfg.compute_type = CUDA_R_64F;
+#else
+        cfg.compute_type = CUBLAS_COMPUTE_64F;
+#endif
+        return INFINI_STATUS_SUCCESS;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+
+// Compute row-major C = A * B using cuBLAS column-major GEMM:
+// C_col = B_col * A_col, where *_col views the same memory as column-major.
+infiniStatus_t gemmRowMajorSquare(
+    cublasHandle_t handle,
+    const GemmTypeConfig &cfg,
+    infiniDtype_t dtype,
+    int n,
+    const void *a,
+    const void *b,
+    void *c) {
+
+    if (dtype == INFINI_DTYPE_F64) {
+        const double alpha = 1.0;
+        const double beta = 0.0;
+        CHECK_CUBLAS(cublasGemmEx(
+            handle,
+            CUBLAS_OP_N,
+            CUBLAS_OP_N,
+            n,
+            n,
+            n,
+            &alpha,
+            b,
+            cfg.io_type,
+            n,
+            a,
+            cfg.io_type,
+            n,
+            &beta,
+            c,
+            cfg.io_type,
+            n,
+            cfg.compute_type,
+            CUBLAS_GEMM_DEFAULT));
+        return INFINI_STATUS_SUCCESS;
+    }
+
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
+    CHECK_CUBLAS(cublasGemmEx(
+        handle,
+        CUBLAS_OP_N,
+        CUBLAS_OP_N,
+        n,
+        n,
+        n,
+        &alpha,
+        b,
+        cfg.io_type,
+        n,
+        a,
+        cfg.io_type,
+        n,
+        &beta,
+        c,
+        cfg.io_type,
+        n,
+        cfg.compute_type,
+        CUBLAS_GEMM_DEFAULT));
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::nvidia::Handle::Internal> internal;
+
+    Opaque(std::shared_ptr<device::nvidia::Handle::Internal> internal_)
+        : internal(internal_) {}
+};
+
+Descriptor::~Descriptor() {
+    if (_opaque) {
+        delete _opaque;
+    }
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    int n) {
+
+    if (handle == nullptr || desc_ptr == nullptr || y_desc == nullptr || x_desc == nullptr) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+    if (n < 0) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+
+    auto dtype = x_desc->dtype();
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+    if (y_desc->dtype() != dtype) {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    auto x_shape = x_desc->shape();
+    auto y_shape = y_desc->shape();
+
+    if (x_shape.size() != 2 || x_shape[0] != x_shape[1]) {
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    if (y_shape != x_shape) {
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;
+    }
+    if (x_desc->hasBroadcastDim() || y_desc->hasBroadcastDim()) {
+        return INFINI_STATUS_BAD_TENSOR_STRIDES;
+    }
+    if (x_shape[0] > static_cast<size_t>(std::numeric_limits<int>::max())) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+
+    auto x_strides = x_desc->strides();
+    auto y_strides = y_desc->strides();
+    bool x_contiguous = x_desc->isContiguous();
+    bool y_contiguous = y_desc->isContiguous();
+
+    size_t matrix_numel = x_desc->numel();
+    size_t matrix_bytes = matrix_numel * infiniSizeOf(dtype);
+    size_t workspace_size = 0;
+    if (n != 0) {
+        workspace_size = matrix_bytes * (y_contiguous ? 2 : 3);
+    }
+
+    auto handle_nvidia = reinterpret_cast<device::nvidia::Handle *>(handle);
+    Descriptor *desc = new Descriptor(dtype, x_shape[0], static_cast<size_t>(n),
+                                      matrix_numel, y_desc->numel(), workspace_size,
+                                      x_strides[0], x_strides[1], y_strides[0], y_strides[1],
+                                      x_contiguous, y_contiguous,
+                                      handle->device, handle->device_id);
+    desc->_opaque = new Opaque(handle_nvidia->internal());
+    *desc_ptr = desc;
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) const {
+
+    if (x == nullptr || y == nullptr) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+    if (workspace_size < this->workspaceSize()) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+    if (this->workspaceSize() != 0 && workspace == nullptr) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+
+    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
+    if (matrix_size == 0) {
+        return INFINI_STATUS_SUCCESS;
+    }
+    if (n == 0) {
+        CHECK_STATUS(initializeIdentity(
+            y, _dtype, matrix_size, y_contiguous, y_stride_0, y_stride_1, cuda_stream));
+        return INFINI_STATUS_SUCCESS;
+    }
+
+    size_t matrix_bytes = input_size * infiniSizeOf(_dtype);
+    char *workspace_ptr = reinterpret_cast<char *>(workspace);
+    void *base = workspace_ptr;
+    void *temp = workspace_ptr + matrix_bytes;
+    void *contiguous_output = y_contiguous ? y : (workspace_ptr + matrix_bytes * 2);
+
+    CHECK_STATUS(initializeIdentity(
+        contiguous_output, _dtype, matrix_size, true, 0, 0, cuda_stream));
+
+    if (x_contiguous) {
+        CHECK_CUDA(cudaMemcpyAsync(base, x, matrix_bytes, cudaMemcpyDeviceToDevice, cuda_stream));
+    } else {
+        CHECK_STATUS(packInputToContiguous(
+            base, x, _dtype, matrix_size, x_stride_0, x_stride_1, cuda_stream));
+    }
+
+    GemmTypeConfig cfg;
+    CHECK_STATUS(getGemmTypeConfig(_dtype, cfg));
+
+    void *result = contiguous_output;
+    void *scratch = temp;
+    void *base_matrix = base;
+    size_t power = n;
+    int matrix_dim = static_cast<int>(matrix_size);
+
+    CHECK_STATUS(_opaque->internal->useCublas(
+        cuda_stream,
+        [&](cublasHandle_t handle) {
+            while (power > 0) {
+                if (power & 1) {
+                    CHECK_STATUS(gemmRowMajorSquare(handle, cfg, _dtype, matrix_dim, result, base_matrix, scratch));
+                    std::swap(result, scratch);
+                }
+                power >>= 1;
+                if (power == 0) {
+                    break;
+                }
+                CHECK_STATUS(gemmRowMajorSquare(handle, cfg, _dtype, matrix_dim, base_matrix, base_matrix, scratch));
+                std::swap(base_matrix, scratch);
+            }
+            return INFINI_STATUS_SUCCESS;
+        }));
+
+    if (y_contiguous) {
+        if (result != y) {
+            CHECK_CUDA(cudaMemcpyAsync(y, result, matrix_bytes, cudaMemcpyDeviceToDevice, cuda_stream));
+        }
+    } else {
+        CHECK_STATUS(scatterContiguousToOutput(
+            y, result, _dtype, matrix_size, y_stride_0, y_stride_1, cuda_stream));
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::matrix_power::nvidia
diff --git a/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cuh b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cuh
new file mode 100644
index 000000000..ebe0ecbdf
--- /dev/null
+++ b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cuh
@@ -0,0 +1,72 @@
+#ifndef __MATRIX_POWER_NVIDIA_H__
+#define __MATRIX_POWER_NVIDIA_H__
+
+#include "../../../operator.h"
+#include "../../../devices/nvidia/nvidia_common.cuh"
+#include <cstddef>
+
+namespace op::matrix_power::nvidia {
+
+class Descriptor final : public InfiniopDescriptor {
+    struct Opaque;
+    Opaque *_opaque;
+    infiniDtype_t _dtype;
+    size_t matrix_size;
+    size_t n;
+    size_t input_size;
+    size_t output_size;
+    size_t workspace_size;
+    ptrdiff_t x_stride_0;
+    ptrdiff_t x_stride_1;
+    ptrdiff_t y_stride_0;
+    ptrdiff_t y_stride_1;
+    bool x_contiguous;
+    bool y_contiguous;
+
+    Descriptor(infiniDtype_t dtype, size_t matrix_size, size_t n,
+               size_t input_size, size_t output_size,
+               size_t workspace_size,
+               ptrdiff_t x_stride_0, ptrdiff_t x_stride_1,
+               ptrdiff_t y_stride_0, ptrdiff_t y_stride_1,
+               bool x_contiguous, bool y_contiguous,
+               infiniDevice_t device_type, int device_id)
+        : InfiniopDescriptor{device_type, device_id},
+          _opaque(nullptr),
+          _dtype(dtype),
+          matrix_size(matrix_size),
+          n(n),
+          input_size(input_size),
+          output_size(output_size),
+          workspace_size(workspace_size),
+          x_stride_0(x_stride_0),
+          x_stride_1(x_stride_1),
+          y_stride_0(y_stride_0),
+          y_stride_1(y_stride_1),
+          x_contiguous(x_contiguous),
+          y_contiguous(y_contiguous) {}
+
+public:
+    ~Descriptor();
+
+    friend struct Opaque;
+
+    static infiniStatus_t create(
+        infiniopHandle_t handle,
+        Descriptor **desc_ptr,
+        infiniopTensorDescriptor_t y_desc,
+        infiniopTensorDescriptor_t x_desc,
+        int n);
+
+    size_t workspaceSize() const { return workspace_size; }
+
+    infiniStatus_t calculate(
+        void *workspace,
+        size_t workspace_size,
+        void *y,
+        const void *x,
+        void *stream) const;
+};
+
+} // namespace op::matrix_power::nvidia
+
+#endif // __MATRIX_POWER_NVIDIA_H__
diff --git a/src/infiniop/ops/matrix_power/operator.cc b/src/infiniop/ops/matrix_power/operator.cc
new file mode 100644
index 000000000..d26e26fd1
--- /dev/null
+++ b/src/infiniop/ops/matrix_power/operator.cc
@@ -0,0 +1,159 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/matrix_power.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/matrix_power_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
+#include "nvidia/matrix_power_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/matrix_power_metax.h"
+#endif
+#ifdef ENABLE_MOORE_API
+#include "moore/matrix_power_moore.h"
+#endif
+
+__C __export infiniStatus_t infiniopCreateMatrixPowerDescriptor(
+    infiniopHandle_t handle,
+    infiniopMatrixPowerDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    int n) {
+
+#define CREATE(CASE, NAMESPACE)                                                              \
+    case CASE:                                                                               \
+        return op::matrix_power::NAMESPACE::Descriptor::create(                              \
+            handle,                                                                          \
+            reinterpret_cast<op::matrix_power::NAMESPACE::Descriptor **>(desc_ptr),         \
+            y_desc,                                                                          \
+            x_desc,                                                                          \
+            n)
+
+    switch (handle->device) {
+
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+__C __export infiniStatus_t infiniopGetMatrixPowerWorkspaceSize(infiniopMatrixPowerDescriptor_t desc, size_t *size) {
+
+#define GET(CASE, NAMESPACE)                                                                              \
+    case CASE:                                                                                            \
+        *size = reinterpret_cast<op::matrix_power::NAMESPACE::Descriptor *>(desc)->workspaceSize();     \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu)
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia)
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax)
+#endif
+#ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore)
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef GET
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C __export infiniStatus_t infiniopMatrixPower(
+    infiniopMatrixPowerDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                                              \
+    case CASE:                                                                                  \
+        return reinterpret_cast<const op::matrix_power::NAMESPACE::Descriptor *>(desc)         \
+            ->calculate(workspace, workspace_size, y, x, stream)
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+__C __export infiniStatus_t
+infiniopDestroyMatrixPowerDescriptor(infiniopMatrixPowerDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                                                  \
+    case CASE:                                                                                   \
+        delete reinterpret_cast<const op::matrix_power::NAMESPACE::Descriptor *>(desc);        \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DELETE
+}
diff --git a/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.cc b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.cc
new file mode 100644
index 000000000..19c1de74b
--- /dev/null
+++ b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.cc
@@ -0,0 +1,137 @@
+#include "pixel_shuffle_cpu.h"
+#include "../../../utils.h"
+
+namespace op::pixel_shuffle::cpu {
+
+utils::Result<PixelShuffleInfo> PixelShuffleInfo::create(
+    infiniopTensorDescriptor_t x_desc,
+    infiniopTensorDescriptor_t y_desc,
+    int upscale_factor) {
+
+    if (upscale_factor <= 0) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+
+    auto x_shape = x_desc->shape();
+    auto y_shape = y_desc->shape();
+
+    if (x_shape.size() != 4) {
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    size_t batch = x_shape[0];
+    size_t in_channels = x_shape[1];
+    size_t height = x_shape[2];
+    size_t width = x_shape[3];
+
+    // Input: (N, C*r^2, H, W) -> Output: (N, C, H*r, W*r)
+    if (in_channels % (upscale_factor * upscale_factor) != 0) {
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    size_t out_channels = in_channels / (upscale_factor * upscale_factor);
+    size_t out_height = height * upscale_factor;
+    size_t out_width = width * upscale_factor;
+
+    std::vector<size_t> expected_y_shape = {batch, out_channels, out_height, out_width};
+    if (y_shape != expected_y_shape) {
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    PixelShuffleInfo info;
+    info.batch = batch;
+    info.in_channels = in_channels;
+    info.out_channels = out_channels;
+    info.height = height;
+    info.width = width;
+    info.upscale_factor = upscale_factor;
+    info.input_size = x_desc->numel();
+    info.output_size = y_desc->numel();
+
+    return utils::Result<PixelShuffleInfo>(std::move(info));
+}
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    int upscale_factor) {
+
+    auto dtype = x_desc->dtype();
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+    CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
+
+    CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES);
+    CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES);
+
+    auto info_result = PixelShuffleInfo::create(x_desc, y_desc, upscale_factor);
+    CHECK_RESULT(info_result);
+
+    *desc_ptr = new Descriptor(dtype, info_result.take(), handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <typename T>
+void pixel_shuffle_impl(
+    const PixelShuffleInfo &info,
+    T *y,
+    const T *x) {
+
+    int r = info.upscale_factor;
+
+    // Input: (N, C*r^2, H, W)
+    // Output: (N, C, H*r, W*r)
+    for (size_t n = 0; n < info.batch; ++n) {
+        for (size_t c = 0; c < info.out_channels; ++c) {
+            for (size_t h = 0; h < info.height; ++h) {
+                for (size_t w = 0; w < info.width; ++w) {
+                    for (int i = 0; i < r; ++i) {
+                        for (int j = 0; j < r; ++j) {
+                            // Input channel index
+                            size_t in_c = c * r * r + i * r + j;
+                            // Input position
+                            size_t in_idx = ((n * info.in_channels + in_c) * info.height + h) * info.width + w;
+                            // Output position
+                            size_t out_h = h * r + i;
+                            size_t out_w = w * r + j;
+                            size_t out_idx = ((n * info.out_channels + c) * (info.height * r) + out_h) * (info.width * r) + out_w;
+                            y[out_idx] = x[in_idx];
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        pixel_shuffle_impl<fp16_t>(_info, reinterpret_cast<fp16_t *>(y), reinterpret_cast<const fp16_t *>(x));
+        break;
+    case INFINI_DTYPE_BF16:
+        pixel_shuffle_impl<bf16_t>(_info, reinterpret_cast<bf16_t *>(y), reinterpret_cast<const bf16_t *>(x));
+        break;
+    case INFINI_DTYPE_F32:
+        pixel_shuffle_impl<float>(_info, reinterpret_cast<float *>(y), reinterpret_cast<const float *>(x));
+        break;
+    case INFINI_DTYPE_F64:
+        pixel_shuffle_impl<double>(_info, reinterpret_cast<double *>(y), reinterpret_cast<const double *>(x));
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::pixel_shuffle::cpu
diff --git a/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h
new file mode 100644
index 000000000..d4a1a2b46
--- /dev/null
+++ b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h
@@ -0,0 +1,58 @@
+#ifndef __PIXEL_SHUFFLE_CPU_H__
+#define __PIXEL_SHUFFLE_CPU_H__
+
+#include "../../../operator.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include <vector>
+
+namespace op::pixel_shuffle::cpu {
+
+struct PixelShuffleInfo {
+    size_t batch;
+    size_t in_channels;
+    size_t out_channels;
+    size_t height;
+    size_t width;
+    int upscale_factor;
+    size_t input_size;
+    size_t output_size;
+
+    static utils::Result<PixelShuffleInfo> create(
+        infiniopTensorDescriptor_t x_desc,
+        infiniopTensorDescriptor_t y_desc,
+        int upscale_factor);
+};
+
+class Descriptor final : public InfiniopDescriptor {
+    infiniDtype_t _dtype;
+    PixelShuffleInfo _info;
+
+    Descriptor(infiniDtype_t dtype, PixelShuffleInfo info,
+               infiniDevice_t device_type, int device_id)
+        : InfiniopDescriptor{device_type, device_id},
+          _dtype(dtype),
+          _info(std::move(info)) {}
+
+public:
+    ~Descriptor();
+
+    static infiniStatus_t create(
+        infiniopHandle_t handle,
+        Descriptor **desc_ptr,
+        infiniopTensorDescriptor_t y_desc,
+        infiniopTensorDescriptor_t x_desc,
+        int upscale_factor);
+
+    size_t workspaceSize() const { return 0; }
+
+    infiniStatus_t calculate(
+        void *workspace,
+        size_t workspace_size,
+        void *y,
+        const void *x,
+        void *stream) const;
+};
+
+} // namespace op::pixel_shuffle::cpu
+
+#endif // __PIXEL_SHUFFLE_CPU_H__
diff --git a/src/infiniop/ops/pixel_shuffle/cuda/kernel.cuh b/src/infiniop/ops/pixel_shuffle/cuda/kernel.cuh
new file mode 100644
index 000000000..758b2ba7a
--- /dev/null
+++ b/src/infiniop/ops/pixel_shuffle/cuda/kernel.cuh
@@ -0,0 +1,95 @@
+#pragma once
+#include <cstddef>
+#include <cuda_runtime.h>
+#include <type_traits>
+
+namespace op::cuda {
+
+template <typename T>
+__global__ void pixel_shuffle_kernel(
+    T *output,
+    const T *input,
+    size_t batch,
+    size_t out_channels,
+    size_t height,
+    size_t width,
+    int r) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t total = batch * out_channels * height * width;
+
+    if (idx >= total) return;
+
+    size_t n = idx / (out_channels * height * width);
+    size_t rem = idx % (out_channels * height * width);
+    size_t c = rem / (height * width);
+    rem = rem % (height * width);
+    size_t oh = rem / width;
+    size_t ow = rem % width;
+
+    // Calculate input indices
+    size_t w = ow / r;
+    size_t h = oh / r;
+    size_t i = oh % r;
+    size_t j = ow % r;
+    size_t in_c = c * r * r + i * r + j;
+
+    size_t in_idx = ((n * (out_channels * r * r) + in_c) * (height / r) + h) * (width / r) + w;
+    size_t out_idx = ((n * out_channels + c) * height + oh) * width + ow;
+
+    output[out_idx] = input[in_idx];
+}
+
+template <typename T>
+__global__ void pixel_shuffle_kernel_strided(
+    T *output,
+    const T *input,
+    size_t batch,
+    size_t out_channels,
+    size_t out_height,
+    size_t out_width,
+    int r,
+    ptrdiff_t x_stride0,
+    ptrdiff_t x_stride1,
+    ptrdiff_t x_stride2,
+    ptrdiff_t x_stride3,
+    ptrdiff_t y_stride0,
+    ptrdiff_t y_stride1,
+    ptrdiff_t y_stride2,
+    ptrdiff_t y_stride3) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t total = batch * out_channels * out_height * out_width;
+
+    if (idx >= total) return;
+
+    const size_t spatial = out_height * out_width;
+    const size_t chw = out_channels * spatial;
+
+    size_t n = idx / chw;
+    size_t rem = idx % chw;
+    size_t c = rem / spatial;
+    rem = rem % spatial;
+    size_t oh = rem / out_width;
+    size_t ow = rem % out_width;
+
+    const size_t upscale = static_cast<size_t>(r);
+    const size_t ih = oh / upscale;
+    const size_t iw = ow / upscale;
+    const size_t i = oh % upscale;
+    const size_t j = ow % upscale;
+    const size_t in_c = c * upscale * upscale + i * upscale + j;
+
+    const ptrdiff_t in_offset = static_cast<ptrdiff_t>(n) * x_stride0 +
+                                static_cast<ptrdiff_t>(in_c) * x_stride1 +
+                                static_cast<ptrdiff_t>(ih) * x_stride2 +
+                                static_cast<ptrdiff_t>(iw) * x_stride3;
+    const ptrdiff_t out_offset = static_cast<ptrdiff_t>(n) * y_stride0 +
+                                 static_cast<ptrdiff_t>(c) * y_stride1 +
+                                 static_cast<ptrdiff_t>(oh) * y_stride2 +
+                                 static_cast<ptrdiff_t>(ow) * y_stride3;
+
+    output[out_offset] = input[in_offset];
+}
+
+} // namespace op::cuda
diff --git a/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.h b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.h
new file mode 100644
index 000000000..d64dbc961
--- /dev/null
+++ b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.h
@@ -0,0 +1,57 @@
+#ifndef __PIXEL_SHUFFLE_METAX_H__
+#define __PIXEL_SHUFFLE_METAX_H__
+
+#include "../../../operator.h"
+#include "../../../devices/metax/metax_common.h"
+
+namespace op::pixel_shuffle::metax {
+
+class Descriptor final : public InfiniopDescriptor {
+    infiniDtype_t _dtype;
+    size_t batch;
+    size_t in_channels;
+    size_t out_channels;
+    size_t height;
+    size_t width;
+    int upscale_factor;
+    size_t input_size;
+    size_t output_size;
+
+    Descriptor(infiniDtype_t dtype, size_t batch, size_t in_channels, size_t out_channels,
+               size_t height, size_t width, int upscale_factor,
+               size_t input_size, size_t output_size,
+               infiniDevice_t device_type, int device_id)
+        : InfiniopDescriptor{device_type, device_id},
+          _dtype(dtype),
+          batch(batch),
+          in_channels(in_channels),
+          out_channels(out_channels),
+          height(height),
+          width(width),
+          upscale_factor(upscale_factor),
+          input_size(input_size),
+          output_size(output_size) {}
+
+public:
+    ~Descriptor();
+
+    static infiniStatus_t create(
+        infiniopHandle_t handle,
+        Descriptor **desc_ptr,
+        infiniopTensorDescriptor_t y_desc,
+        infiniopTensorDescriptor_t x_desc,
+        int upscale_factor);
+
+    size_t workspaceSize() const { return 0; }
+
+    infiniStatus_t calculate(
+        void *workspace,
+        size_t workspace_size,
+        void *y,
+        const void *x,
+        void *stream) const;
+};
+
+} // namespace op::pixel_shuffle::metax
+
+#endif // __PIXEL_SHUFFLE_METAX_H__
diff --git a/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.maca b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.maca
new file mode 100644
index 000000000..4c9e5ca78
--- /dev/null
+++ b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.maca
@@ -0,0 +1,113 @@
+#include "pixel_shuffle_metax.h"
+#include "../cuda/kernel.cuh"
+#include "../../../utils.h"
+#include "../../../devices/metax/metax_kernel_common.h"
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+namespace op::pixel_shuffle::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    int upscale_factor) {
+
+    auto dtype = x_desc->dtype();
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+    CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
+
+    CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES);
+    CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES);
+
+    if (upscale_factor <= 0) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+
+    auto x_shape = x_desc->shape();
+    auto y_shape = y_desc->shape();
+
+    if (x_shape.size() != 4) {
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    size_t batch = x_shape[0];
+    size_t in_channels = x_shape[1];
+    size_t height = x_shape[2];
+    size_t width = x_shape[3];
+
+    if (in_channels % (upscale_factor * upscale_factor) != 0) {
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    size_t out_channels = in_channels / (upscale_factor * upscale_factor);
+    size_t out_height = height * upscale_factor;
+    size_t out_width = width * upscale_factor;
+
+    std::vector<size_t> expected_y_shape = {batch, out_channels, out_height, out_width};
+    if (y_shape != expected_y_shape) {
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    *desc_ptr = new Descriptor(dtype, batch, in_channels, out_channels,
+                               height, width, upscale_factor,
+                               x_desc->numel(), y_desc->numel(),
+                               handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) const {
+
+    auto hc_stream = reinterpret_cast<hcStream_t>(stream);
+    constexpr int BLOCK_SIZE = 256;
+    size_t total = output_size;
+    if (total == 0) {
+        return INFINI_STATUS_SUCCESS;
+    }
+    int num_blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        cuda::pixel_shuffle_kernel<half><<<num_blocks, BLOCK_SIZE, 0, hc_stream>>>(
+            reinterpret_cast<half *>(y),
+            reinterpret_cast<const half *>(x),
+            batch, out_channels, height * upscale_factor, width * upscale_factor,
+            upscale_factor);
+        break;
+    case INFINI_DTYPE_BF16:
+        cuda::pixel_shuffle_kernel<cuda_bfloat16><<<num_blocks, BLOCK_SIZE, 0, hc_stream>>>(
+            reinterpret_cast<cuda_bfloat16 *>(y),
+            reinterpret_cast<const cuda_bfloat16 *>(x),
+            batch, out_channels, height * upscale_factor, width * upscale_factor,
+            upscale_factor);
+        break;
+    case INFINI_DTYPE_F32:
+        cuda::pixel_shuffle_kernel<float><<<num_blocks, BLOCK_SIZE, 0, hc_stream>>>(
+            reinterpret_cast<float *>(y),
+            reinterpret_cast<const float *>(x),
+            batch, out_channels, height * upscale_factor, width * upscale_factor,
+            upscale_factor);
+        break;
+    case INFINI_DTYPE_F64:
+        cuda::pixel_shuffle_kernel<double><<<num_blocks, BLOCK_SIZE, 0, hc_stream>>>(
+            reinterpret_cast<double *>(y),
+            reinterpret_cast<const double *>(x),
+            batch, out_channels, height * upscale_factor, width * upscale_factor,
+            upscale_factor);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::pixel_shuffle::metax
diff --git a/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.h b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.h
new file mode 100644
index 000000000..db1a6db4c
--- /dev/null
+++ b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.h
@@ -0,0 +1,57 @@
+#ifndef __PIXEL_SHUFFLE_MOORE_H__
+#define __PIXEL_SHUFFLE_MOORE_H__
+
+#include "../../../operator.h"
+#include "../../../devices/moore/moore_common.h"
+
+namespace op::pixel_shuffle::moore {
+
+class Descriptor final : public InfiniopDescriptor {
+    infiniDtype_t _dtype;
+    size_t batch;
+    size_t in_channels;
+    size_t out_channels;
+    size_t height;
+    size_t width;
+    int upscale_factor;
+    size_t input_size;
+    size_t output_size;
+
+    Descriptor(infiniDtype_t dtype, size_t batch, size_t in_channels, size_t out_channels,
+               size_t height, size_t width, int upscale_factor,
+               size_t input_size, size_t output_size,
+               infiniDevice_t device_type, int device_id)
+        : InfiniopDescriptor{device_type, device_id},
+          _dtype(dtype),
+          batch(batch),
+          in_channels(in_channels),
+          out_channels(out_channels),
+          height(height),
+          width(width),
+          upscale_factor(upscale_factor),
+          input_size(input_size),
+          output_size(output_size) {}
+
+public:
+    ~Descriptor();
+
+    static infiniStatus_t create(
+        infiniopHandle_t handle,
+        Descriptor **desc_ptr,
+        infiniopTensorDescriptor_t y_desc,
+        infiniopTensorDescriptor_t x_desc,
+        int upscale_factor);
+
+    size_t workspaceSize() const { return 0; }
+
+    infiniStatus_t calculate(
+        void *workspace,
+        size_t workspace_size,
+        void *y,
+        const void *x,
+        void *stream) const;
+};
+
+} // namespace op::pixel_shuffle::moore
+
+#endif // __PIXEL_SHUFFLE_MOORE_H__
diff --git a/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.mu b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.mu
new file mode 100644
index 000000000..6fb6f9ef2
--- /dev/null
+++ b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.mu
@@ -0,0 +1,113 @@
+#include "pixel_shuffle_moore.h"
+#include "../cuda/kernel.cuh"
+#include "../../../utils.h"
+#include "../../../devices/moore/moore_kernel_common.h"
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+namespace op::pixel_shuffle::moore {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    int upscale_factor) {
+
+    auto dtype = x_desc->dtype();
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+    CHECK_OR_RETURN(y_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
+
+    CHECK_OR_RETURN(x_desc->isContiguous() && y_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES);
+    CHECK_OR_RETURN(!x_desc->hasBroadcastDim() && !y_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES);
+
+    if (upscale_factor <= 0) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+
+    auto x_shape = x_desc->shape();
+    auto y_shape = y_desc->shape();
+
+    if (x_shape.size() != 4) {
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    size_t batch = x_shape[0];
+    size_t in_channels = x_shape[1];
+    size_t height = x_shape[2];
+    size_t width = x_shape[3];
+
+    if (in_channels % (upscale_factor * upscale_factor) != 0) {
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    size_t out_channels = in_channels / (upscale_factor * upscale_factor);
+    size_t out_height = height * upscale_factor;
+    size_t out_width = width * upscale_factor;
+
+    std::vector<size_t> expected_y_shape = {batch, out_channels, out_height, out_width};
+    if (y_shape != expected_y_shape) {
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    *desc_ptr = new Descriptor(dtype, batch, in_channels, out_channels,
+                               height, width, upscale_factor,
+                               x_desc->numel(), y_desc->numel(),
+                               handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) const {
+
+    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
+    constexpr int BLOCK_SIZE = 256;
+    size_t total = output_size;
+    if (total == 0) {
+        return INFINI_STATUS_SUCCESS;
+    }
+    int num_blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        cuda::pixel_shuffle_kernel<half><<<num_blocks, BLOCK_SIZE, 0, musa_stream>>>(
+            reinterpret_cast<half *>(y),
+            reinterpret_cast<const half *>(x),
+            batch, out_channels, height * upscale_factor, width * upscale_factor,
+            upscale_factor);
+        break;
+    case INFINI_DTYPE_BF16:
+        cuda::pixel_shuffle_kernel<cuda_bfloat16><<<num_blocks, BLOCK_SIZE, 0, musa_stream>>>(
+            reinterpret_cast<cuda_bfloat16 *>(y),
+            reinterpret_cast<const cuda_bfloat16 *>(x),
+            batch, out_channels, height * upscale_factor, width * upscale_factor,
+            upscale_factor);
+        break;
+    case INFINI_DTYPE_F32:
+        cuda::pixel_shuffle_kernel<float><<<num_blocks, BLOCK_SIZE, 0, musa_stream>>>(
+            reinterpret_cast<float *>(y),
+            reinterpret_cast<const float *>(x),
+            batch, out_channels, height * upscale_factor, width * upscale_factor,
+            upscale_factor);
+        break;
+    case INFINI_DTYPE_F64:
+        cuda::pixel_shuffle_kernel<double><<<num_blocks, BLOCK_SIZE, 0, musa_stream>>>(
+            reinterpret_cast<double *>(y),
+            reinterpret_cast<const double *>(x),
+            batch, out_channels, height * upscale_factor, width * upscale_factor,
+            upscale_factor);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::pixel_shuffle::moore
diff --git a/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cu b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cu
new file mode 100644
index 000000000..32b36e226
--- /dev/null
+++ b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cu
@@ -0,0 +1,129 @@
+#include "pixel_shuffle_nvidia.cuh"
+#include "../cuda/kernel.cuh"
+#include "../../../utils.h"
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <array>
+
+namespace op::pixel_shuffle::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    int upscale_factor) {
+
+    auto dtype = x_desc->dtype();
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+    if (y_desc->dtype() != dtype) {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    if (upscale_factor <= 0) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+
+    auto x_shape = x_desc->shape();
+    auto y_shape = y_desc->shape();
+
+    if (x_shape.size() != 4 || y_shape.size() != 4) {
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;
+    }
+    if (x_desc->hasBroadcastDim() || y_desc->hasBroadcastDim()) {
+        return INFINI_STATUS_BAD_TENSOR_STRIDES;
+    }
+
+    size_t batch = x_shape[0];
+    size_t in_channels = x_shape[1];
+    size_t height = x_shape[2];
+    size_t width = x_shape[3];
+
+    if (in_channels % (upscale_factor * upscale_factor) != 0) {
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    size_t out_channels = in_channels / (upscale_factor * upscale_factor);
+    size_t out_height = height * upscale_factor;
+    size_t out_width = width * upscale_factor;
+
+    std::vector<size_t> expected_y_shape = {batch, out_channels, out_height, out_width};
+    if (y_shape != expected_y_shape) {
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    std::array<ptrdiff_t, 4> x_strides = {x_desc->stride(0), x_desc->stride(1), x_desc->stride(2), x_desc->stride(3)};
+    std::array<ptrdiff_t, 4> y_strides = {y_desc->stride(0), y_desc->stride(1), y_desc->stride(2), y_desc->stride(3)};
+
+    *desc_ptr = new Descriptor(dtype, batch, in_channels, out_channels,
+                               height, width, upscale_factor,
+                               x_desc->numel(), y_desc->numel(),
+                               x_strides, y_strides,
+                               handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) const {
+
+    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
+    constexpr int BLOCK_SIZE = 256;
+    const size_t out_height = height * static_cast<size_t>(upscale_factor);
+    const size_t out_width = width * static_cast<size_t>(upscale_factor);
+    const size_t total = output_size;
+    if (total == 0) {
+        return INFINI_STATUS_SUCCESS;
+    }
+    int num_blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        cuda::pixel_shuffle_kernel_strided<half><<<num_blocks, BLOCK_SIZE, 0, cuda_stream>>>(
+            reinterpret_cast<half *>(y),
+            reinterpret_cast<const half *>(x),
+            batch, out_channels, out_height, out_width,
+            upscale_factor,
+            x_strides[0], x_strides[1], x_strides[2], x_strides[3],
+            y_strides[0], y_strides[1], y_strides[2], y_strides[3]);
+        break;
+    case INFINI_DTYPE_BF16:
+        cuda::pixel_shuffle_kernel_strided<__nv_bfloat16><<<num_blocks, BLOCK_SIZE, 0, cuda_stream>>>(
+            reinterpret_cast<__nv_bfloat16 *>(y),
+            reinterpret_cast<const __nv_bfloat16 *>(x),
+            batch, out_channels, out_height, out_width,
+            upscale_factor,
+            x_strides[0], x_strides[1], x_strides[2], x_strides[3],
+            y_strides[0], y_strides[1], y_strides[2], y_strides[3]);
+        break;
+    case INFINI_DTYPE_F32:
+        cuda::pixel_shuffle_kernel_strided<float><<<num_blocks, BLOCK_SIZE, 0, cuda_stream>>>(
+            reinterpret_cast<float *>(y),
+            reinterpret_cast<const float *>(x),
+            batch, out_channels, out_height, out_width,
+            upscale_factor,
+            x_strides[0], x_strides[1], x_strides[2], x_strides[3],
+            y_strides[0], y_strides[1], y_strides[2], y_strides[3]);
+        break;
+    case INFINI_DTYPE_F64:
+        cuda::pixel_shuffle_kernel_strided<double><<<num_blocks, BLOCK_SIZE, 0, cuda_stream>>>(
+            reinterpret_cast<double *>(y),
+            reinterpret_cast<const double *>(x),
+            batch, out_channels, out_height, out_width,
+            upscale_factor,
+            x_strides[0], x_strides[1], x_strides[2], x_strides[3],
+            y_strides[0], y_strides[1], y_strides[2], y_strides[3]);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::pixel_shuffle::nvidia
diff --git a/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cuh b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cuh
new file mode 100644
index 000000000..1cd155cab
--- /dev/null
+++ b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cuh
@@ -0,0 +1,65 @@
+#ifndef __PIXEL_SHUFFLE_NVIDIA_H__
+#define __PIXEL_SHUFFLE_NVIDIA_H__
+
+#include "../../../operator.h"
+#include "../../../devices/nvidia/nvidia_common.cuh"
+#include <array>
+#include <cstddef>
+
+namespace op::pixel_shuffle::nvidia {
+
+class Descriptor final : public InfiniopDescriptor {
+    infiniDtype_t _dtype;
+    size_t batch;
+    size_t in_channels;
+    size_t out_channels;
+    size_t height;
+    size_t width;
+    int upscale_factor;
+    size_t input_size;
+    size_t output_size;
+    std::array<ptrdiff_t, 4> x_strides;
+    std::array<ptrdiff_t, 4> y_strides;
+
+    Descriptor(infiniDtype_t dtype, size_t batch, size_t in_channels, size_t out_channels,
+               size_t height, size_t width, int upscale_factor,
+               size_t input_size, size_t output_size,
+               std::array<ptrdiff_t, 4> x_strides,
+               std::array<ptrdiff_t, 4> y_strides,
+               infiniDevice_t device_type, int device_id)
+        : InfiniopDescriptor{device_type, device_id},
+          _dtype(dtype),
+          batch(batch),
+          in_channels(in_channels),
+          out_channels(out_channels),
+          height(height),
+          width(width),
+          upscale_factor(upscale_factor),
+          input_size(input_size),
+          output_size(output_size),
+          x_strides(x_strides),
+          y_strides(y_strides) {}
+
+public:
+    ~Descriptor();
+
+    static infiniStatus_t create(
+        infiniopHandle_t handle,
+        Descriptor **desc_ptr,
+        infiniopTensorDescriptor_t y_desc,
+        infiniopTensorDescriptor_t x_desc,
+        int upscale_factor);
+
+    size_t workspaceSize() const { return 0; }
+
+    infiniStatus_t calculate(
+        void *workspace,
+        size_t workspace_size,
+        void *y,
+        const void *x,
+        void *stream) const;
+};
+
+} // namespace op::pixel_shuffle::nvidia
+
+#endif // __PIXEL_SHUFFLE_NVIDIA_H__
diff --git a/src/infiniop/ops/pixel_shuffle/operator.cc b/src/infiniop/ops/pixel_shuffle/operator.cc
new file mode 100644
index 000000000..8147b0b2d
--- /dev/null
+++ b/src/infiniop/ops/pixel_shuffle/operator.cc
@@ -0,0 +1,159 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/pixel_shuffle.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/pixel_shuffle_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
+#include "nvidia/pixel_shuffle_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/pixel_shuffle_metax.h"
+#endif
+#ifdef ENABLE_MOORE_API
+#include "moore/pixel_shuffle_moore.h"
+#endif
+
+__C __export infiniStatus_t infiniopCreatePixelShuffleDescriptor(
+    infiniopHandle_t handle,
+    infiniopPixelShuffleDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    int upscale_factor) {
+
+#define CREATE(CASE, NAMESPACE)                                                          \
+    case CASE:                                                                           \
+        return op::pixel_shuffle::NAMESPACE::Descriptor::create(                        \
+            handle,                                                                      \
+            reinterpret_cast<op::pixel_shuffle::NAMESPACE::Descriptor **>(desc_ptr),   \
+            y_desc,                                                                      \
+            x_desc,                                                                      \
+            upscale_factor)
+
+    switch (handle->device) {
+
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+__C __export infiniStatus_t infiniopGetPixelShuffleWorkspaceSize(infiniopPixelShuffleDescriptor_t desc, size_t *size) {
+
+#define GET(CASE, NAMESPACE)                                                                          \
+    case CASE:                                                                                        \
+        *size = reinterpret_cast<op::pixel_shuffle::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu)
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia)
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax)
+#endif
+#ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore)
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef GET
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C __export infiniStatus_t infiniopPixelShuffle(
+    infiniopPixelShuffleDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                                          \
+    case CASE:                                                                              \
+        return reinterpret_cast<const op::pixel_shuffle::NAMESPACE::Descriptor *>(desc)   \
+            ->calculate(workspace, workspace_size, y, x, stream)
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+__C __export infiniStatus_t
+infiniopDestroyPixelShuffleDescriptor(infiniopPixelShuffleDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                                              \
+    case CASE:                                                                               \
+        delete reinterpret_cast<const op::pixel_shuffle::NAMESPACE::Descriptor *>(desc);   \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DELETE
+}

From 2647cf913b4a462b99517fdd2f15ff9aa8b11bf7 Mon Sep 17 00:00:00 2001
From: PanZezhong <panzezhong@qiyuanlab.com>
Date: Tue, 7 Apr 2026 16:57:04 +0800
Subject: [PATCH 2/2] issue/1031 fix T1-1-41

---
 include/infiniop/ops/erf.h                    |  24 +--
 include/infiniop/ops/erfc.h                   |  24 +--
 include/infiniop/ops/erfinv.h                 |  24 +--
 include/infiniop/ops/matrix_power.h           |  26 +--
 include/infiniop/ops/pixel_shuffle.h          |  26 +--
 src/infiniop/ops/erf/cuda/kernel.cuh          |   5 +-
 src/infiniop/ops/erf/moore/erf_moore.mu       |  10 +-
 src/infiniop/ops/erf/moore/erf_moore_kernel.h |  36 ----
 src/infiniop/ops/erf/operator.cc              |  34 ++--
 src/infiniop/ops/erfc/cuda/kernel.cuh         |   5 +-
 src/infiniop/ops/erfc/moore/erfc_moore.mu     |  10 +-
 .../ops/erfc/moore/erfc_moore_kernel.h        |  36 ----
 src/infiniop/ops/erfc/operator.cc             |  30 ++--
 src/infiniop/ops/erfinv/cpu/erfinv_cpu.h      |  22 ++-
 src/infiniop/ops/erfinv/cuda/kernel.cuh       |  85 +++++----
 src/infiniop/ops/erfinv/moore/erfinv_moore.mu |  10 +-
 .../ops/erfinv/moore/erfinv_moore_kernel.h    |  72 --------
 src/infiniop/ops/erfinv/operator.cc           |  14 +-
 .../ops/matrix_power/cpu/matrix_power_cpu.cc  |   4 +-
 .../ops/matrix_power/cpu/matrix_power_cpu.h   |   6 +-
 .../matrix_power/metax/matrix_power_metax.h   |   1 -
 .../metax/matrix_power_metax.maca             |   8 +-
 .../matrix_power/moore/matrix_power_moore.h   |   1 -
 .../matrix_power/moore/matrix_power_moore.mu  |   8 +-
 .../nvidia/matrix_power_nvidia.cu             |   6 +-
 .../nvidia/matrix_power_nvidia.cuh            |   1 -
 src/infiniop/ops/matrix_power/operator.cc     |  40 ++---
 .../pixel_shuffle/cpu/pixel_shuffle_cpu.cc    |   2 +-
 .../ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h |   2 +-
 .../ops/pixel_shuffle/cuda/kernel.cuh         |  19 +-
 .../pixel_shuffle/metax/pixel_shuffle_metax.h |   1 -
 .../metax/pixel_shuffle_metax.maca            |   9 +-
 .../pixel_shuffle/moore/pixel_shuffle_moore.h |   1 -
 .../moore/pixel_shuffle_moore.mu              |   9 +-
 .../nvidia/pixel_shuffle_nvidia.cu            |   8 +-
 .../nvidia/pixel_shuffle_nvidia.cuh           |   1 -
 src/infiniop/ops/pixel_shuffle/operator.cc    |  38 ++--
 test/infiniop/erf.py                          | 118 +++++++++++++
 test/infiniop/erfc.py                         | 117 +++++++++++++
 test/infiniop/erfinv.py                       | 118 +++++++++++++
 test/infiniop/libinfiniop/op_register.py      | 162 ++++++++++++++++++
 test/infiniop/matrix_power.py                 | 104 +++++++++++
 test/infiniop/pixel_shuffle.py                | 114 ++++++++++++
 43 files changed, 993 insertions(+), 398 deletions(-)
 delete mode 100644 src/infiniop/ops/erf/moore/erf_moore_kernel.h
 delete mode 100644 src/infiniop/ops/erfc/moore/erfc_moore_kernel.h
 delete mode 100644 src/infiniop/ops/erfinv/moore/erfinv_moore_kernel.h
 create mode 100644 test/infiniop/erf.py
 create mode 100644 test/infiniop/erfc.py
 create mode 100644 test/infiniop/erfinv.py
 create mode 100644 test/infiniop/matrix_power.py
 create mode 100644 test/infiniop/pixel_shuffle.py

diff --git a/include/infiniop/ops/erf.h b/include/infiniop/ops/erf.h
index 8cbb8fb74..8786b31a3 100644
--- a/include/infiniop/ops/erf.h
+++ b/include/infiniop/ops/erf.h
@@ -5,20 +5,20 @@
 
 typedef struct InfiniopDescriptor *infiniopErfDescriptor_t;
 
-__C __export infiniStatus_t infiniopCreateErfDescriptor(infiniopHandle_t handle,
-                                                        infiniopErfDescriptor_t *desc_ptr,
-                                                        infiniopTensorDescriptor_t y,
-                                                        infiniopTensorDescriptor_t x);
+__INFINI_C __export infiniStatus_t infiniopCreateErfDescriptor(infiniopHandle_t handle,
+                                                               infiniopErfDescriptor_t *desc_ptr,
+                                                               infiniopTensorDescriptor_t y,
+                                                               infiniopTensorDescriptor_t x);
 
-__C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size);
+__INFINI_C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size);
 
-__C __export infiniStatus_t infiniopErf(infiniopErfDescriptor_t desc,
-                                        void *workspace,
-                                        size_t workspace_size,
-                                        void *y,
-                                        const void *x,
-                                        void *stream);
+__INFINI_C __export infiniStatus_t infiniopErf(infiniopErfDescriptor_t desc,
+                                               void *workspace,
+                                               size_t workspace_size,
+                                               void *y,
+                                               const void *x,
+                                               void *stream);
 
-__C __export infiniStatus_t infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc);
+__INFINI_C __export infiniStatus_t infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc);
 
 #endif
diff --git a/include/infiniop/ops/erfc.h b/include/infiniop/ops/erfc.h
index 6454573bc..6bceebe35 100644
--- a/include/infiniop/ops/erfc.h
+++ b/include/infiniop/ops/erfc.h
@@ -5,20 +5,20 @@
 
 typedef struct InfiniopDescriptor *infiniopErfcDescriptor_t;
 
-__C __export infiniStatus_t infiniopCreateErfcDescriptor(infiniopHandle_t handle,
-                                                         infiniopErfcDescriptor_t *desc_ptr,
-                                                         infiniopTensorDescriptor_t y,
-                                                         infiniopTensorDescriptor_t x);
+__INFINI_C __export infiniStatus_t infiniopCreateErfcDescriptor(infiniopHandle_t handle,
+                                                                infiniopErfcDescriptor_t *desc_ptr,
+                                                                infiniopTensorDescriptor_t y,
+                                                                infiniopTensorDescriptor_t x);
 
-__C __export infiniStatus_t infiniopGetErfcWorkspaceSize(infiniopErfcDescriptor_t desc, size_t *size);
+__INFINI_C __export infiniStatus_t infiniopGetErfcWorkspaceSize(infiniopErfcDescriptor_t desc, size_t *size);
 
-__C __export infiniStatus_t infiniopErfc(infiniopErfcDescriptor_t desc,
-                                         void *workspace,
-                                         size_t workspace_size,
-                                         void *y,
-                                         const void *x,
-                                         void *stream);
+__INFINI_C __export infiniStatus_t infiniopErfc(infiniopErfcDescriptor_t desc,
+                                                void *workspace,
+                                                size_t workspace_size,
+                                                void *y,
+                                                const void *x,
+                                                void *stream);
 
-__C __export infiniStatus_t infiniopDestroyErfcDescriptor(infiniopErfcDescriptor_t desc);
+__INFINI_C __export infiniStatus_t infiniopDestroyErfcDescriptor(infiniopErfcDescriptor_t desc);
 
 #endif
diff --git a/include/infiniop/ops/erfinv.h b/include/infiniop/ops/erfinv.h
index 79bc09f22..b14975253 100644
--- a/include/infiniop/ops/erfinv.h
+++ b/include/infiniop/ops/erfinv.h
@@ -5,20 +5,20 @@
 
 typedef struct InfiniopDescriptor *infiniopErfinvDescriptor_t;
 
-__C __export infiniStatus_t infiniopCreateErfinvDescriptor(infiniopHandle_t handle,
-                                                           infiniopErfinvDescriptor_t *desc_ptr,
-                                                           infiniopTensorDescriptor_t y,
-                                                           infiniopTensorDescriptor_t x);
+__INFINI_C __export infiniStatus_t infiniopCreateErfinvDescriptor(infiniopHandle_t handle,
+                                                                  infiniopErfinvDescriptor_t *desc_ptr,
+                                                                  infiniopTensorDescriptor_t y,
+                                                                  infiniopTensorDescriptor_t x);
 
-__C __export infiniStatus_t infiniopGetErfinvWorkspaceSize(infiniopErfinvDescriptor_t desc, size_t *size);
+__INFINI_C __export infiniStatus_t infiniopGetErfinvWorkspaceSize(infiniopErfinvDescriptor_t desc, size_t *size);
 
-__C __export infiniStatus_t infiniopErfinv(infiniopErfinvDescriptor_t desc,
-                                           void *workspace,
-                                           size_t workspace_size,
-                                           void *y,
-                                           const void *x,
-                                           void *stream);
+__INFINI_C __export infiniStatus_t infiniopErfinv(infiniopErfinvDescriptor_t desc,
+                                                  void *workspace,
+                                                  size_t workspace_size,
+                                                  void *y,
+                                                  const void *x,
+                                                  void *stream);
 
-__C __export infiniStatus_t infiniopDestroyErfinvDescriptor(infiniopErfinvDescriptor_t desc);
+__INFINI_C __export infiniStatus_t infiniopDestroyErfinvDescriptor(infiniopErfinvDescriptor_t desc);
 
 #endif
diff --git a/include/infiniop/ops/matrix_power.h b/include/infiniop/ops/matrix_power.h
index acd7c0c7e..639d3cf02 100644
--- a/include/infiniop/ops/matrix_power.h
+++ b/include/infiniop/ops/matrix_power.h
@@ -5,21 +5,21 @@
 
 typedef struct InfiniopDescriptor *infiniopMatrixPowerDescriptor_t;
 
-__C __export infiniStatus_t infiniopCreateMatrixPowerDescriptor(infiniopHandle_t handle,
-                                                                infiniopMatrixPowerDescriptor_t *desc_ptr,
-                                                                infiniopTensorDescriptor_t y,
-                                                                infiniopTensorDescriptor_t x,
-                                                                int n);
+__INFINI_C __export infiniStatus_t infiniopCreateMatrixPowerDescriptor(infiniopHandle_t handle,
+                                                                       infiniopMatrixPowerDescriptor_t *desc_ptr,
+                                                                       infiniopTensorDescriptor_t y,
+                                                                       infiniopTensorDescriptor_t x,
+                                                                       int n);
 
-__C __export infiniStatus_t infiniopGetMatrixPowerWorkspaceSize(infiniopMatrixPowerDescriptor_t desc, size_t *size);
+__INFINI_C __export infiniStatus_t infiniopGetMatrixPowerWorkspaceSize(infiniopMatrixPowerDescriptor_t desc, size_t *size);
 
-__C __export infiniStatus_t infiniopMatrixPower(infiniopMatrixPowerDescriptor_t desc,
-                                                void *workspace,
-                                                size_t workspace_size,
-                                                void *y,
-                                                const void *x,
-                                                void *stream);
+__INFINI_C __export infiniStatus_t infiniopMatrixPower(infiniopMatrixPowerDescriptor_t desc,
+                                                       void *workspace,
+                                                       size_t workspace_size,
+                                                       void *y,
+                                                       const void *x,
+                                                       void *stream);
 
-__C __export infiniStatus_t infiniopDestroyMatrixPowerDescriptor(infiniopMatrixPowerDescriptor_t desc);
+__INFINI_C __export infiniStatus_t infiniopDestroyMatrixPowerDescriptor(infiniopMatrixPowerDescriptor_t desc);
 
 #endif
diff --git a/include/infiniop/ops/pixel_shuffle.h b/include/infiniop/ops/pixel_shuffle.h
index 941a91cfc..fac8eff2c 100644
--- a/include/infiniop/ops/pixel_shuffle.h
+++ b/include/infiniop/ops/pixel_shuffle.h
@@ -5,21 +5,21 @@
 
 typedef struct InfiniopDescriptor *infiniopPixelShuffleDescriptor_t;
 
-__C __export infiniStatus_t infiniopCreatePixelShuffleDescriptor(infiniopHandle_t handle,
-                                                                 infiniopPixelShuffleDescriptor_t *desc_ptr,
-                                                                 infiniopTensorDescriptor_t y,
-                                                                 infiniopTensorDescriptor_t x,
-                                                                 int upscale_factor);
+__INFINI_C __export infiniStatus_t infiniopCreatePixelShuffleDescriptor(infiniopHandle_t handle,
+                                                                        infiniopPixelShuffleDescriptor_t *desc_ptr,
+                                                                        infiniopTensorDescriptor_t y,
+                                                                        infiniopTensorDescriptor_t x,
+                                                                        int upscale_factor);
 
-__C __export infiniStatus_t infiniopGetPixelShuffleWorkspaceSize(infiniopPixelShuffleDescriptor_t desc, size_t *size);
+__INFINI_C __export infiniStatus_t infiniopGetPixelShuffleWorkspaceSize(infiniopPixelShuffleDescriptor_t desc, size_t *size);
 
-__C __export infiniStatus_t infiniopPixelShuffle(infiniopPixelShuffleDescriptor_t desc,
-                                                 void *workspace,
-                                                 size_t workspace_size,
-                                                 void *y,
-                                                 const void *x,
-                                                 void *stream);
+__INFINI_C __export infiniStatus_t infiniopPixelShuffle(infiniopPixelShuffleDescriptor_t desc,
+                                                        void *workspace,
+                                                        size_t workspace_size,
+                                                        void *y,
+                                                        const void *x,
+                                                        void *stream);
 
-__C __export infiniStatus_t infiniopDestroyPixelShuffleDescriptor(infiniopPixelShuffleDescriptor_t desc);
+__INFINI_C __export infiniStatus_t infiniopDestroyPixelShuffleDescriptor(infiniopPixelShuffleDescriptor_t desc);
 
 #endif
diff --git a/src/infiniop/ops/erf/cuda/kernel.cuh b/src/infiniop/ops/erf/cuda/kernel.cuh
index 9bd6cff21..8efe037e3 100644
--- a/src/infiniop/ops/erf/cuda/kernel.cuh
+++ b/src/infiniop/ops/erf/cuda/kernel.cuh
@@ -1,8 +1,5 @@
 #pragma once
 #include <cmath>
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#include <cuda_runtime.h>
 #include <type_traits>
 
 namespace op::cuda {
@@ -15,7 +12,7 @@ struct ErfOp {
         if constexpr (std::is_same_v<T, float>) {
             return erff(x);
         } else if constexpr (std::is_same_v<T, double>) {
-            return erf(x);
+            return ::erf(x);
         } else {
             // For F16/BF16: promote to float, compute, then cast back
             float xf;
diff --git a/src/infiniop/ops/erf/moore/erf_moore.mu b/src/infiniop/ops/erf/moore/erf_moore.mu
index 1f717fa51..856764c70 100644
--- a/src/infiniop/ops/erf/moore/erf_moore.mu
+++ b/src/infiniop/ops/erf/moore/erf_moore.mu
@@ -2,7 +2,7 @@
 
 #include "../../../elementwise/moore/elementwise_moore.h"
 
-#include "erf_moore_kernel.h"
+#include "../cuda/kernel.cuh"
 
 namespace op::erf::moore {
 
@@ -43,13 +43,13 @@ infiniStatus_t Descriptor::calculate(
 
     switch (_dtype) {
     case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, moore::ErfOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::ErfOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, moore::ErfOp, half>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::ErfOp, half>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, moore::ErfOp, float>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::ErfOp, float>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, moore::ErfOp, double>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::ErfOp, double>(_info, workspace, output, inputs, stream);
     default:
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }
diff --git a/src/infiniop/ops/erf/moore/erf_moore_kernel.h b/src/infiniop/ops/erf/moore/erf_moore_kernel.h
deleted file mode 100644
index 8ddc9d5f1..000000000
--- a/src/infiniop/ops/erf/moore/erf_moore_kernel.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef __ERF_MOORE_KERNEL_H__
-#define __ERF_MOORE_KERNEL_H__
-
-#include <cmath>
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#include <type_traits>
-
-namespace op::erf::moore {
-
-typedef struct ErfOp {
-public:
-    static constexpr size_t num_inputs = 1;
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &x) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            float x0 = __low2float(x);
-            float x1 = __high2float(x);
-            return __floats2half2_rn(erff(x0), erff(x1));
-        } else if constexpr (std::is_same_v<T, half>) {
-            float xf = __half2float(x);
-            return __float2half(erff(xf));
-        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
-            float xf = __bfloat162float(x);
-            return __float2bfloat16_rn(erff(xf));
-        } else if constexpr (std::is_same_v<T, float>) {
-            return erff(x);
-        } else { // double
-            return erf(x);
-        }
-    }
-} ErfOp;
-
-} // namespace op::erf::moore
-
-#endif // __ERF_MOORE_KERNEL_H__
diff --git a/src/infiniop/ops/erf/operator.cc b/src/infiniop/ops/erf/operator.cc
index f9b61e981..7ed264e33 100644
--- a/src/infiniop/ops/erf/operator.cc
+++ b/src/infiniop/ops/erf/operator.cc
@@ -15,18 +15,18 @@
 #include "moore/erf_moore.h"
 #endif
 
-__C __export infiniStatus_t infiniopCreateErfDescriptor(
+__INFINI_C __export infiniStatus_t infiniopCreateErfDescriptor(
     infiniopHandle_t handle,
     infiniopErfDescriptor_t *desc_ptr,
     infiniopTensorDescriptor_t y_desc,
     infiniopTensorDescriptor_t x_desc) {
 
-#define CREATE(CASE, NAMESPACE)                                              \
-    case CASE:                                                               \
-        return op::erf::NAMESPACE::Descriptor::create(                      \
-            handle,                                                          \
-            reinterpret_cast<op::erf::NAMESPACE::Descriptor **>(desc_ptr),  \
-            y_desc,                                                          \
+#define CREATE(CASE, NAMESPACE)                                            \
+    case CASE:                                                             \
+        return op::erf::NAMESPACE::Descriptor::create(                     \
+            handle,                                                        \
+            reinterpret_cast<op::erf::NAMESPACE::Descriptor **>(desc_ptr), \
+            y_desc,                                                        \
             {x_desc})
 
     switch (handle->device) {
@@ -54,10 +54,10 @@ __C __export infiniStatus_t infiniopCreateErfDescriptor(
 #undef CREATE
 }
 
-__C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size) {
+__INFINI_C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size) {
 
-#define GET(CASE, NAMESPACE)                                                                 \
-    case CASE:                                                                               \
+#define GET(CASE, NAMESPACE)                                                               \
+    case CASE:                                                                             \
         *size = reinterpret_cast<op::erf::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
         return INFINI_STATUS_SUCCESS;
 
@@ -85,7 +85,7 @@ __C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t
     return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
 }
 
-__C __export infiniStatus_t infiniopErf(
+__INFINI_C __export infiniStatus_t infiniopErf(
     infiniopErfDescriptor_t desc,
     void *workspace,
     size_t workspace_size,
@@ -93,9 +93,9 @@ __C __export infiniStatus_t infiniopErf(
     const void *x,
     void *stream) {
 
-#define CALCULATE(CASE, NAMESPACE)                                              \
-    case CASE:                                                                  \
-        return reinterpret_cast<const op::erf::NAMESPACE::Descriptor *>(desc)  \
+#define CALCULATE(CASE, NAMESPACE)                                            \
+    case CASE:                                                                \
+        return reinterpret_cast<const op::erf::NAMESPACE::Descriptor *>(desc) \
             ->calculate(workspace, workspace_size, y, {x}, stream)
 
     switch (desc->device_type) {
@@ -123,11 +123,11 @@ __C __export infiniStatus_t infiniopErf(
 #undef CALCULATE
 }
 
-__C __export infiniStatus_t
+__INFINI_C __export infiniStatus_t
 infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc) {
 
-#define DELETE(CASE, NAMESPACE)                                                  \
-    case CASE:                                                                   \
+#define DELETE(CASE, NAMESPACE)                                                \
+    case CASE:                                                                 \
         delete reinterpret_cast<const op::erf::NAMESPACE::Descriptor *>(desc); \
         return INFINI_STATUS_SUCCESS;
 
diff --git a/src/infiniop/ops/erfc/cuda/kernel.cuh b/src/infiniop/ops/erfc/cuda/kernel.cuh
index aae8efcee..5a448919d 100644
--- a/src/infiniop/ops/erfc/cuda/kernel.cuh
+++ b/src/infiniop/ops/erfc/cuda/kernel.cuh
@@ -1,8 +1,5 @@
 #pragma once
 #include <cmath>
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#include <cuda_runtime.h>
 #include <type_traits>
 
 namespace op::cuda {
@@ -15,7 +12,7 @@ struct ErfcOp {
         if constexpr (std::is_same_v<T, float>) {
             return erfcf(x);
         } else if constexpr (std::is_same_v<T, double>) {
-            return erfc(x);
+            return ::erfc(x);
         } else {
             // For F16/BF16: promote to float, compute, then cast back
             float xf;
diff --git a/src/infiniop/ops/erfc/moore/erfc_moore.mu b/src/infiniop/ops/erfc/moore/erfc_moore.mu
index d1eaec1bf..d58243e50 100644
--- a/src/infiniop/ops/erfc/moore/erfc_moore.mu
+++ b/src/infiniop/ops/erfc/moore/erfc_moore.mu
@@ -2,7 +2,7 @@
 
 #include "../../../elementwise/moore/elementwise_moore.h"
 
-#include "erfc_moore_kernel.h"
+#include "../cuda/kernel.cuh"
 
 namespace op::erfc::moore {
 
@@ -43,13 +43,13 @@ infiniStatus_t Descriptor::calculate(
 
     switch (_dtype) {
     case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, moore::ErfcOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::ErfcOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, moore::ErfcOp, half>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::ErfcOp, half>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, moore::ErfcOp, float>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::ErfcOp, float>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, moore::ErfcOp, double>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::ErfcOp, double>(_info, workspace, output, inputs, stream);
     default:
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }
diff --git a/src/infiniop/ops/erfc/moore/erfc_moore_kernel.h b/src/infiniop/ops/erfc/moore/erfc_moore_kernel.h
deleted file mode 100644
index cd5225c3b..000000000
--- a/src/infiniop/ops/erfc/moore/erfc_moore_kernel.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef __ERFC_MOORE_KERNEL_H__
-#define __ERFC_MOORE_KERNEL_H__
-
-#include <cmath>
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#include <type_traits>
-
-namespace op::erfc::moore {
-
-typedef struct ErfcOp {
-public:
-    static constexpr size_t num_inputs = 1;
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &x) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            float x0 = __low2float(x);
-            float x1 = __high2float(x);
-            return __floats2half2_rn(erfcf(x0), erfcf(x1));
-        } else if constexpr (std::is_same_v<T, half>) {
-            float xf = __half2float(x);
-            return __float2half(erfcf(xf));
-        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
-            float xf = __bfloat162float(x);
-            return __float2bfloat16_rn(erfcf(xf));
-        } else if constexpr (std::is_same_v<T, float>) {
-            return erfcf(x);
-        } else { // double
-            return erfc(x);
-        }
-    }
-} ErfcOp;
-
-} // namespace op::erfc::moore
-
-#endif // __ERFC_MOORE_KERNEL_H__
diff --git a/src/infiniop/ops/erfc/operator.cc b/src/infiniop/ops/erfc/operator.cc
index fa102c90c..ebd692349 100644
--- a/src/infiniop/ops/erfc/operator.cc
+++ b/src/infiniop/ops/erfc/operator.cc
@@ -15,18 +15,18 @@
 #include "moore/erfc_moore.h"
 #endif
 
-__C __export infiniStatus_t infiniopCreateErfcDescriptor(
+__INFINI_C __export infiniStatus_t infiniopCreateErfcDescriptor(
     infiniopHandle_t handle,
     infiniopErfcDescriptor_t *desc_ptr,
     infiniopTensorDescriptor_t y_desc,
     infiniopTensorDescriptor_t x_desc) {
 
-#define CREATE(CASE, NAMESPACE)                                              \
-    case CASE:                                                               \
+#define CREATE(CASE, NAMESPACE)                                             \
+    case CASE:                                                              \
         return op::erfc::NAMESPACE::Descriptor::create(                     \
-            handle,                                                          \
+            handle,                                                         \
             reinterpret_cast<op::erfc::NAMESPACE::Descriptor **>(desc_ptr), \
-            y_desc,                                                          \
+            y_desc,                                                         \
             {x_desc})
 
     switch (handle->device) {
@@ -54,10 +54,10 @@ __C __export infiniStatus_t infiniopCreateErfcDescriptor(
 #undef CREATE
 }
 
-__C __export infiniStatus_t infiniopGetErfcWorkspaceSize(infiniopErfcDescriptor_t desc, size_t *size) {
+__INFINI_C __export infiniStatus_t infiniopGetErfcWorkspaceSize(infiniopErfcDescriptor_t desc, size_t *size) {
 
-#define GET(CASE, NAMESPACE)                                                                 \
-    case CASE:                                                                               \
+#define GET(CASE, NAMESPACE)                                                                \
+    case CASE:                                                                              \
         *size = reinterpret_cast<op::erfc::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
         return INFINI_STATUS_SUCCESS;
 
@@ -85,7 +85,7 @@ __C __export infiniStatus_t infiniopGetErfcWorkspaceSize(infiniopErfcDescriptor_
     return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
 }
 
-__C __export infiniStatus_t infiniopErfc(
+__INFINI_C __export infiniStatus_t infiniopErfc(
     infiniopErfcDescriptor_t desc,
     void *workspace,
     size_t workspace_size,
@@ -93,9 +93,9 @@ __C __export infiniStatus_t infiniopErfc(
     const void *x,
     void *stream) {
 
-#define CALCULATE(CASE, NAMESPACE)                                              \
-    case CASE:                                                                  \
-        return reinterpret_cast<const op::erfc::NAMESPACE::Descriptor *>(desc)  \
+#define CALCULATE(CASE, NAMESPACE)                                             \
+    case CASE:                                                                 \
+        return reinterpret_cast<const op::erfc::NAMESPACE::Descriptor *>(desc) \
             ->calculate(workspace, workspace_size, y, {x}, stream)
 
     switch (desc->device_type) {
@@ -123,11 +123,11 @@ __C __export infiniStatus_t infiniopErfc(
 #undef CALCULATE
 }
 
-__C __export infiniStatus_t
+__INFINI_C __export infiniStatus_t
 infiniopDestroyErfcDescriptor(infiniopErfcDescriptor_t desc) {
 
-#define DELETE(CASE, NAMESPACE)                                                  \
-    case CASE:                                                                   \
+#define DELETE(CASE, NAMESPACE)                                                 \
+    case CASE:                                                                  \
         delete reinterpret_cast<const op::erfc::NAMESPACE::Descriptor *>(desc); \
         return INFINI_STATUS_SUCCESS;
 
diff --git a/src/infiniop/ops/erfinv/cpu/erfinv_cpu.h b/src/infiniop/ops/erfinv/cpu/erfinv_cpu.h
index 8cc218a1d..e7b1b723d 100644
--- a/src/infiniop/ops/erfinv/cpu/erfinv_cpu.h
+++ b/src/infiniop/ops/erfinv/cpu/erfinv_cpu.h
@@ -13,10 +13,18 @@ namespace op::erfinv::cpu {
 template <typename T>
 T erfinv_impl(T x) {
     // Domain: x in (-1, 1)
-    if (x == 1.0) return std::numeric_limits<T>::infinity();
-    if (x == -1.0) return -std::numeric_limits<T>::infinity();
-    if (x > 1.0 || x < -1.0) return std::numeric_limits<T>::quiet_NaN();
-    if (x == 0.0) return 0.0;
+    if (x == 1.0) {
+        return std::numeric_limits<T>::infinity();
+    }
+    if (x == -1.0) {
+        return -std::numeric_limits<T>::infinity();
+    }
+    if (x > 1.0 || x < -1.0) {
+        return std::numeric_limits<T>::quiet_NaN();
+    }
+    if (x == 0.0) {
+        return 0.0;
+    }
 
     // Use Newton's method to solve erf(y) = x
     T y = x; // Initial guess
@@ -25,9 +33,11 @@ T erfinv_impl(T x) {
 
     for (int i = 0; i < max_iter; ++i) {
         T erf_y = std::erf(y);
-        T derf_dy = 2.0 / std::sqrt(3.14159265358979323846) * std::exp(-y * y);
+        T derf_dy = T(2.0) / T(std::sqrt(3.14159265358979323846) * std::exp(-y * y));
         T error = erf_y - x;
-        if (std::abs(error) < tol) break;
+        if (std::abs(error) < tol) {
+            break;
+        }
         y = y - error / derf_dy;
     }
     return y;
diff --git a/src/infiniop/ops/erfinv/cuda/kernel.cuh b/src/infiniop/ops/erfinv/cuda/kernel.cuh
index 4f4660e80..18025331a 100644
--- a/src/infiniop/ops/erfinv/cuda/kernel.cuh
+++ b/src/infiniop/ops/erfinv/cuda/kernel.cuh
@@ -1,37 +1,41 @@
 #pragma once
 #include <cmath>
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#include <cuda_runtime.h>
-#include <math_constants.h>
+#include <limits>
 #include <type_traits>
 
 namespace op::cuda {
 
+constexpr float PI_F = 3.14159265358979323846f;
+constexpr double PI = 3.14159265358979323846;
+
 // Inverse error function.
-//
-// We use a Winitzki-style approximation for an initial guess, then refine with
-// a few Newton iterations. Starting with y=x converges poorly for x close to 1,
-// which appears frequently in test inputs (torch.rand in [0,1)).
 __device__ __forceinline__ float erfinv_impl(float x) {
-    if (x == 1.0f) return CUDART_INF_F;
-    if (x == -1.0f) return -CUDART_INF_F;
-    if (x > 1.0f || x < -1.0f) return CUDART_NAN_F;
-    if (x == 0.0f) return 0.0f;
+    if (x == 1.0f) {
+        return std::numeric_limits<float>::infinity();
+    }
+    if (x == -1.0f) {
+        return -std::numeric_limits<float>::infinity();
+    }
+    if (x > 1.0f || x < -1.0f) {
+        return std::numeric_limits<float>::quiet_NaN();
+    }
+    if (x == 0.0f) {
+        return 0.0f;
+    }
 
-    // Winitzki approximation (a = 0.147) for initial guess.
-    // See: https://arxiv.org/abs/math/0306301 (and common implementations).
+    // Winitzki approximation
     const float a = 0.147f;
-    const float ln = log1pf(-x * x); // ln(1 - x^2) <= 0
-    const float t = 2.0f / (CUDART_PI_F * a) + ln * 0.5f;
+    const float ln = log1pf(-x * x);
+    const float t = 2.0f / (PI_F * a) + ln * 0.5f;
+
     float inside = t * t - ln / a;
     inside = inside > 0.0f ? inside : 0.0f;
+
     float y0 = copysignf(sqrtf(sqrtf(inside) - t), x);
 
-    // Fast path: a few Newton steps in float.
-    // This is sufficient for most x and much faster than always refining in double.
     float y = y0;
-    const float sqrt_pi_f = 1.7724538509055159f; // sqrt(pi)
+    const float sqrt_pi_f = sqrtf(PI_F);
+
 #pragma unroll
     for (int i = 0; i < 4; ++i) {
         const float erf_y = erff(y);
@@ -39,17 +43,13 @@ __device__ __forceinline__ float erfinv_impl(float x) {
         y = y - (erf_y - x) / derf_dy;
     }
 
-    // Hybrid slow path: only for values extremely close to ±1 where float erf
-    // quantization can cause Newton iterations to stagnate, leading to noticeable
-    // absolute error in y (even if erff(y) == x in float).
-    //
-    // The threshold is chosen so the slow path is taken very rarely for typical
-    // random inputs, minimizing warp divergence and preserving performance.
+    // Slow path near |x| ~ 1
     const float ax = fabsf(x);
     if (1.0f - ax < 1e-4f) {
         const double xd = static_cast<double>(x);
         double yd = static_cast<double>(y);
-        const double sqrt_pi = 1.7724538509055159; // sqrt(pi)
+        const double sqrt_pi = sqrt(PI);
+
 #pragma unroll
         for (int i = 0; i < 4; ++i) {
             const double erf_y = erf(yd);
@@ -69,32 +69,47 @@ struct ErfinvOp {
     __device__ __forceinline__ T operator()(T x) const {
         if constexpr (std::is_same_v<T, float>) {
             return erfinv_impl(x);
+
         } else if constexpr (std::is_same_v<T, double>) {
-            // For double, use similar approach
-            if (x == 1.0) return CUDART_INF;
-            if (x == -1.0) return -CUDART_INF;
-            if (x > 1.0 || x < -1.0) return CUDART_NAN;
-            if (x == 0.0) return 0.0;
+            if (x == 1.0) {
+                return std::numeric_limits<double>::infinity();
+            }
+            if (x == -1.0) {
+                return -std::numeric_limits<double>::infinity();
+            }
+            if (x > 1.0 || x < -1.0) {
+                return std::numeric_limits<double>::quiet_NaN();
+            }
+            if (x == 0.0) {
+                return 0.0;
+            }
+
             const double a = 0.147;
             const double ln = log1p(-x * x);
-            const double t = 2.0 / (CUDART_PI * a) + ln * 0.5;
+            const double t = 2.0 / (PI * a) + ln * 0.5;
+
             double inside = t * t - ln / a;
             inside = inside > 0.0 ? inside : 0.0;
+
             double y = copysign(sqrt(sqrt(inside) - t), x);
 
             const int max_iter = 30;
             const double tol = 1e-14;
-            const double sqrt_pi = 1.7724538509055159;
+            const double sqrt_pi = sqrt(PI);
+
             for (int i = 0; i < max_iter; ++i) {
                 const double erf_y = erf(y);
                 const double error = erf_y - x;
-                if (fabs(error) < tol) break;
+                if (fabs(error) < tol) {
+                    break;
+                }
                 const double derf_dy = 2.0 / sqrt_pi * exp(-y * y);
                 y = y - error / derf_dy;
             }
             return y;
+
         } else {
-            // For F16/BF16: promote to float, compute, then cast back
+            // F16 / BF16 / other types
             float xf;
             if constexpr (std::is_same_v<T, half>) {
                 xf = __half2float(x);
diff --git a/src/infiniop/ops/erfinv/moore/erfinv_moore.mu b/src/infiniop/ops/erfinv/moore/erfinv_moore.mu
index 54b5830ea..26febe69d 100644
--- a/src/infiniop/ops/erfinv/moore/erfinv_moore.mu
+++ b/src/infiniop/ops/erfinv/moore/erfinv_moore.mu
@@ -2,7 +2,7 @@
 
 #include "../../../elementwise/moore/elementwise_moore.h"
 
-#include "erfinv_moore_kernel.h"
+#include "../cuda/kernel.cuh"
 
 namespace op::erfinv::moore {
 
@@ -43,13 +43,13 @@ infiniStatus_t Descriptor::calculate(
 
     switch (_dtype) {
     case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, moore::ErfinvOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::ErfinvOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, moore::ErfinvOp, half>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::ErfinvOp, half>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, moore::ErfinvOp, float>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::ErfinvOp, float>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, moore::ErfinvOp, double>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::ErfinvOp, double>(_info, workspace, output, inputs, stream);
     default:
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }
diff --git a/src/infiniop/ops/erfinv/moore/erfinv_moore_kernel.h b/src/infiniop/ops/erfinv/moore/erfinv_moore_kernel.h
deleted file mode 100644
index e3f3bb5f2..000000000
--- a/src/infiniop/ops/erfinv/moore/erfinv_moore_kernel.h
+++ /dev/null
@@ -1,72 +0,0 @@
-#ifndef __ERFINV_MOORE_KERNEL_H__
-#define __ERFINV_MOORE_KERNEL_H__
-
-#include <cmath>
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#include <type_traits>
-
-namespace op::erfinv::moore {
-
-// Inverse error function using Newton's method
-template <typename T>
-__device__ __forceinline__ T erfinv_impl(T x) {
-    if (x >= 1.0f) return CUDART_INF_F;
-    if (x <= -1.0f) return -CUDART_INF_F;
-    if (x == 0.0f) return 0.0f;
-
-    T y = x;
-    const int max_iter = 10;
-    const T tol = 1e-10f;
-    const T sqrt_pi = 1.7724538509055159f;
-
-    for (int i = 0; i < max_iter; ++i) {
-        T erf_y = erff(y);
-        T derf_dy = 2.0f / sqrt_pi * expf(-y * y);
-        T error = erf_y - x;
-        if (fabsf(error) < tol) break;
-        y = y - error / derf_dy;
-    }
-    return y;
-}
-
-typedef struct ErfinvOp {
-public:
-    static constexpr size_t num_inputs = 1;
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &x) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            float x0 = __low2float(x);
-            float x1 = __high2float(x);
-            return __floats2half2_rn(erfinv_impl(x0), erfinv_impl(x1));
-        } else if constexpr (std::is_same_v<T, half>) {
-            float xf = __half2float(x);
-            return __float2half(erfinv_impl(xf));
-        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
-            float xf = __bfloat162float(x);
-            return __float2bfloat16_rn(erfinv_impl(xf));
-        } else if constexpr (std::is_same_v<T, float>) {
-            return erfinv_impl(x);
-        } else { // double
-            if (x >= 1.0) return CUDART_INF;
-            if (x <= -1.0) return -CUDART_INF;
-            if (x == 0.0) return 0.0;
-            double y = x;
-            const int max_iter = 10;
-            const double tol = 1e-10;
-            const double sqrt_pi = 1.7724538509055159;
-            for (int i = 0; i < max_iter; ++i) {
-                double erf_y = erf(y);
-                double derf_dy = 2.0 / sqrt_pi * exp(-y * y);
-                double error = erf_y - x;
-                if (fabs(error) < tol) break;
-                y = y - error / derf_dy;
-            }
-            return y;
-        }
-    }
-} ErfinvOp;
-
-} // namespace op::erfinv::moore
-
-#endif // __ERFINV_MOORE_KERNEL_H__
diff --git a/src/infiniop/ops/erfinv/operator.cc b/src/infiniop/ops/erfinv/operator.cc
index c7c360bec..c0453562b 100644
--- a/src/infiniop/ops/erfinv/operator.cc
+++ b/src/infiniop/ops/erfinv/operator.cc
@@ -15,7 +15,7 @@
 #include "moore/erfinv_moore.h"
 #endif
 
-__C __export infiniStatus_t infiniopCreateErfinvDescriptor(
+__INFINI_C __export infiniStatus_t infiniopCreateErfinvDescriptor(
     infiniopHandle_t handle,
     infiniopErfinvDescriptor_t *desc_ptr,
     infiniopTensorDescriptor_t y_desc,
@@ -23,7 +23,7 @@ __C __export infiniStatus_t infiniopCreateErfinvDescriptor(
 
 #define CREATE(CASE, NAMESPACE)                                               \
     case CASE:                                                                \
-        return op::erfinv::NAMESPACE::Descriptor::create(                    \
+        return op::erfinv::NAMESPACE::Descriptor::create(                     \
             handle,                                                           \
             reinterpret_cast<op::erfinv::NAMESPACE::Descriptor **>(desc_ptr), \
             y_desc,                                                           \
@@ -54,10 +54,10 @@ __C __export infiniStatus_t infiniopCreateErfinvDescriptor(
 #undef CREATE
 }
 
-__C __export infiniStatus_t infiniopGetErfinvWorkspaceSize(infiniopErfinvDescriptor_t desc, size_t *size) {
+__INFINI_C __export infiniStatus_t infiniopGetErfinvWorkspaceSize(infiniopErfinvDescriptor_t desc, size_t *size) {
 
-#define GET(CASE, NAMESPACE)                                                                   \
-    case CASE:                                                                                 \
+#define GET(CASE, NAMESPACE)                                                                  \
+    case CASE:                                                                                \
         *size = reinterpret_cast<op::erfinv::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
         return INFINI_STATUS_SUCCESS;
 
@@ -85,7 +85,7 @@ __C __export infiniStatus_t infiniopGetErfinvWorkspaceSize(infiniopErfinvDescrip
     return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
 }
 
-__C __export infiniStatus_t infiniopErfinv(
+__INFINI_C __export infiniStatus_t infiniopErfinv(
     infiniopErfinvDescriptor_t desc,
     void *workspace,
     size_t workspace_size,
@@ -123,7 +123,7 @@ __C __export infiniStatus_t infiniopErfinv(
 #undef CALCULATE
 }
 
-__C __export infiniStatus_t
+__INFINI_C __export infiniStatus_t
 infiniopDestroyErfinvDescriptor(infiniopErfinvDescriptor_t desc) {
 
 #define DELETE(CASE, NAMESPACE)                                                   \
diff --git a/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.cc b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.cc
index 22a792012..6c2fe60cd 100644
--- a/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.cc
+++ b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.cc
@@ -1,7 +1,7 @@
 #include "matrix_power_cpu.h"
-#include "../../../utils.h"
-#include <cstring>
+#include "../../../tensor.h"
 #include <algorithm>
+#include <cstring>
 
 namespace op::matrix_power::cpu {
 
diff --git a/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.h b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.h
index 9c6f2ebc1..06a14c341 100644
--- a/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.h
+++ b/src/infiniop/ops/matrix_power/cpu/matrix_power_cpu.h
@@ -1,15 +1,15 @@
 #ifndef __MATRIX_POWER_CPU_H__
 #define __MATRIX_POWER_CPU_H__
 
-#include "../../../operator.h"
 #include "../../../devices/cpu/common_cpu.h"
+#include "../../../operator.h"
 #include <vector>
 
 namespace op::matrix_power::cpu {
 
 struct MatrixPowerInfo {
-    size_t matrix_size;  // N x N matrix
-    size_t n;            // Power
+    size_t matrix_size; // N x N matrix
+    size_t n;           // Power
     size_t input_size;
     size_t output_size;
 
diff --git a/src/infiniop/ops/matrix_power/metax/matrix_power_metax.h b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.h
index 02ff4b14d..43e561f66 100644
--- a/src/infiniop/ops/matrix_power/metax/matrix_power_metax.h
+++ b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.h
@@ -2,7 +2,6 @@
 #define __MATRIX_POWER_METAX_H__
 
 #include "../../../operator.h"
-#include "../../../devices/metax/metax_common.h"
 
 namespace op::matrix_power::metax {
 
diff --git a/src/infiniop/ops/matrix_power/metax/matrix_power_metax.maca b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.maca
index 0e7f58f19..221757141 100644
--- a/src/infiniop/ops/matrix_power/metax/matrix_power_metax.maca
+++ b/src/infiniop/ops/matrix_power/metax/matrix_power_metax.maca
@@ -1,9 +1,9 @@
-#include "matrix_power_metax.h"
-#include "../../../utils.h"
+#include "../../../devices/metax/metax_common.h"
 #include "../../../devices/metax/metax_kernel_common.h"
-#include <cuda_runtime.h>
-#include <vector>
+#include "../../../tensor.h"
+#include "matrix_power_metax.h"
 #include <cstring>
+#include <vector>
 
 namespace op::matrix_power::metax {
 
diff --git a/src/infiniop/ops/matrix_power/moore/matrix_power_moore.h b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.h
index a58428a6e..42dd5c180 100644
--- a/src/infiniop/ops/matrix_power/moore/matrix_power_moore.h
+++ b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.h
@@ -2,7 +2,6 @@
 #define __MATRIX_POWER_MOORE_H__
 
 #include "../../../operator.h"
-#include "../../../devices/moore/moore_common.h"
 
 namespace op::matrix_power::moore {
 
diff --git a/src/infiniop/ops/matrix_power/moore/matrix_power_moore.mu b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.mu
index 532480955..938ad5c0a 100644
--- a/src/infiniop/ops/matrix_power/moore/matrix_power_moore.mu
+++ b/src/infiniop/ops/matrix_power/moore/matrix_power_moore.mu
@@ -1,9 +1,9 @@
-#include "matrix_power_moore.h"
-#include "../../../utils.h"
+#include "../../../devices/moore/moore_common.h"
 #include "../../../devices/moore/moore_kernel_common.h"
-#include <cuda_runtime.h>
-#include <vector>
+#include "../../../tensor.h"
+#include "matrix_power_moore.h"
 #include <cstring>
+#include <vector>
 
 namespace op::matrix_power::moore {
 
diff --git a/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cu b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cu
index e6164c8ca..d728238c1 100644
--- a/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cu
+++ b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cu
@@ -1,9 +1,7 @@
-#include "matrix_power_nvidia.cuh"
-#include "../../../utils.h"
 #include "../../../devices/nvidia/nvidia_handle.cuh"
 #include "../../../devices/nvidia/nvidia_kernel_common.cuh"
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
+#include "../../../tensor.h"
+#include "matrix_power_nvidia.cuh"
 #include <limits>
 #include <utility>
 
diff --git a/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cuh b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cuh
index ebe0ecbdf..d11c6914e 100644
--- a/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cuh
+++ b/src/infiniop/ops/matrix_power/nvidia/matrix_power_nvidia.cuh
@@ -2,7 +2,6 @@
 #define __MATRIX_POWER_NVIDIA_H__
 
 #include "../../../operator.h"
-#include "../../../devices/nvidia/nvidia_common.cuh"
 #include <cstddef>
 
 namespace op::matrix_power::nvidia {
diff --git a/src/infiniop/ops/matrix_power/operator.cc b/src/infiniop/ops/matrix_power/operator.cc
index d26e26fd1..79a81077a 100644
--- a/src/infiniop/ops/matrix_power/operator.cc
+++ b/src/infiniop/ops/matrix_power/operator.cc
@@ -15,20 +15,20 @@
 #include "moore/matrix_power_moore.h"
 #endif
 
-__C __export infiniStatus_t infiniopCreateMatrixPowerDescriptor(
+__INFINI_C __export infiniStatus_t infiniopCreateMatrixPowerDescriptor(
     infiniopHandle_t handle,
     infiniopMatrixPowerDescriptor_t *desc_ptr,
     infiniopTensorDescriptor_t y_desc,
     infiniopTensorDescriptor_t x_desc,
     int n) {
 
-#define CREATE(CASE, NAMESPACE)                                                              \
-    case CASE:                                                                               \
-        return op::matrix_power::NAMESPACE::Descriptor::create(                              \
-            handle,                                                                          \
-            reinterpret_cast<op::matrix_power::NAMESPACE::Descriptor **>(desc_ptr),         \
-            y_desc,                                                                          \
-            x_desc,                                                                          \
+#define CREATE(CASE, NAMESPACE)                                                     \
+    case CASE:                                                                      \
+        return op::matrix_power::NAMESPACE::Descriptor::create(                     \
+            handle,                                                                 \
+            reinterpret_cast<op::matrix_power::NAMESPACE::Descriptor **>(desc_ptr), \
+            y_desc,                                                                 \
+            x_desc,                                                                 \
             n)
 
     switch (handle->device) {
@@ -56,11 +56,11 @@ __C __export infiniStatus_t infiniopCreateMatrixPowerDescriptor(
 #undef CREATE
 }
 
-__C __export infiniStatus_t infiniopGetMatrixPowerWorkspaceSize(infiniopMatrixPowerDescriptor_t desc, size_t *size) {
+__INFINI_C __export infiniStatus_t infiniopGetMatrixPowerWorkspaceSize(infiniopMatrixPowerDescriptor_t desc, size_t *size) {
 
-#define GET(CASE, NAMESPACE)                                                                              \
-    case CASE:                                                                                            \
-        *size = reinterpret_cast<op::matrix_power::NAMESPACE::Descriptor *>(desc)->workspaceSize();     \
+#define GET(CASE, NAMESPACE)                                                                        \
+    case CASE:                                                                                      \
+        *size = reinterpret_cast<op::matrix_power::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
         return INFINI_STATUS_SUCCESS;
 
     switch (desc->device_type) {
@@ -87,7 +87,7 @@ __C __export infiniStatus_t infiniopGetMatrixPowerWorkspaceSize(infiniopMatrixPo
     return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
 }
 
-__C __export infiniStatus_t infiniopMatrixPower(
+__INFINI_C __export infiniStatus_t infiniopMatrixPower(
     infiniopMatrixPowerDescriptor_t desc,
     void *workspace,
     size_t workspace_size,
@@ -95,9 +95,9 @@ __C __export infiniStatus_t infiniopMatrixPower(
     const void *x,
     void *stream) {
 
-#define CALCULATE(CASE, NAMESPACE)                                                              \
-    case CASE:                                                                                  \
-        return reinterpret_cast<const op::matrix_power::NAMESPACE::Descriptor *>(desc)         \
+#define CALCULATE(CASE, NAMESPACE)                                                     \
+    case CASE:                                                                         \
+        return reinterpret_cast<const op::matrix_power::NAMESPACE::Descriptor *>(desc) \
             ->calculate(workspace, workspace_size, y, x, stream)
 
     switch (desc->device_type) {
@@ -125,12 +125,12 @@ __C __export infiniStatus_t infiniopMatrixPower(
 #undef CALCULATE
 }
 
-__C __export infiniStatus_t
+__INFINI_C __export infiniStatus_t
 infiniopDestroyMatrixPowerDescriptor(infiniopMatrixPowerDescriptor_t desc) {
 
-#define DELETE(CASE, NAMESPACE)                                                                  \
-    case CASE:                                                                                   \
-        delete reinterpret_cast<const op::matrix_power::NAMESPACE::Descriptor *>(desc);        \
+#define DELETE(CASE, NAMESPACE)                                                         \
+    case CASE:                                                                          \
+        delete reinterpret_cast<const op::matrix_power::NAMESPACE::Descriptor *>(desc); \
         return INFINI_STATUS_SUCCESS;
 
     switch (desc->device_type) {
diff --git a/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.cc b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.cc
index 19c1de74b..15d0d6906 100644
--- a/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.cc
+++ b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.cc
@@ -1,5 +1,5 @@
 #include "pixel_shuffle_cpu.h"
-#include "../../../utils.h"
+#include "../../../tensor.h"
 
 namespace op::pixel_shuffle::cpu {
 
diff --git a/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h
index d4a1a2b46..2fa5e00fe 100644
--- a/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h
+++ b/src/infiniop/ops/pixel_shuffle/cpu/pixel_shuffle_cpu.h
@@ -1,8 +1,8 @@
 #ifndef __PIXEL_SHUFFLE_CPU_H__
 #define __PIXEL_SHUFFLE_CPU_H__
 
-#include "../../../operator.h"
 #include "../../../devices/cpu/common_cpu.h"
+#include "../../../operator.h"
 #include <vector>
 
 namespace op::pixel_shuffle::cpu {
diff --git a/src/infiniop/ops/pixel_shuffle/cuda/kernel.cuh b/src/infiniop/ops/pixel_shuffle/cuda/kernel.cuh
index 758b2ba7a..874b29cce 100644
--- a/src/infiniop/ops/pixel_shuffle/cuda/kernel.cuh
+++ b/src/infiniop/ops/pixel_shuffle/cuda/kernel.cuh
@@ -1,6 +1,5 @@
 #pragma once
 #include <cstddef>
-#include <cuda_runtime.h>
 #include <type_traits>
 
 namespace op::cuda {
@@ -18,7 +17,9 @@ __global__ void pixel_shuffle_kernel(
     size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
     size_t total = batch * out_channels * height * width;
 
-    if (idx >= total) return;
+    if (idx >= total) {
+        return;
+    }
 
     size_t n = idx / (out_channels * height * width);
     size_t rem = idx % (out_channels * height * width);
@@ -61,7 +62,9 @@ __global__ void pixel_shuffle_kernel_strided(
     size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
     size_t total = batch * out_channels * out_height * out_width;
 
-    if (idx >= total) return;
+    if (idx >= total) {
+        return;
+    }
 
     const size_t spatial = out_height * out_width;
     const size_t chw = out_channels * spatial;
@@ -80,14 +83,8 @@ __global__ void pixel_shuffle_kernel_strided(
     const size_t j = ow % upscale;
     const size_t in_c = c * upscale * upscale + i * upscale + j;
 
-    const ptrdiff_t in_offset = static_cast<ptrdiff_t>(n) * x_stride0 +
-                                static_cast<ptrdiff_t>(in_c) * x_stride1 +
-                                static_cast<ptrdiff_t>(ih) * x_stride2 +
-                                static_cast<ptrdiff_t>(iw) * x_stride3;
-    const ptrdiff_t out_offset = static_cast<ptrdiff_t>(n) * y_stride0 +
-                                 static_cast<ptrdiff_t>(c) * y_stride1 +
-                                 static_cast<ptrdiff_t>(oh) * y_stride2 +
-                                 static_cast<ptrdiff_t>(ow) * y_stride3;
+    const ptrdiff_t in_offset = static_cast<ptrdiff_t>(n) * x_stride0 + static_cast<ptrdiff_t>(in_c) * x_stride1 + static_cast<ptrdiff_t>(ih) * x_stride2 + static_cast<ptrdiff_t>(iw) * x_stride3;
+    const ptrdiff_t out_offset = static_cast<ptrdiff_t>(n) * y_stride0 + static_cast<ptrdiff_t>(c) * y_stride1 + static_cast<ptrdiff_t>(oh) * y_stride2 + static_cast<ptrdiff_t>(ow) * y_stride3;
 
     output[out_offset] = input[in_offset];
 }
diff --git a/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.h b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.h
index d64dbc961..34f216655 100644
--- a/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.h
+++ b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.h
@@ -2,7 +2,6 @@
 #define __PIXEL_SHUFFLE_METAX_H__
 
 #include "../../../operator.h"
-#include "../../../devices/metax/metax_common.h"
 
 namespace op::pixel_shuffle::metax {
 
diff --git a/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.maca b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.maca
index 4c9e5ca78..3c8afd6f2 100644
--- a/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.maca
+++ b/src/infiniop/ops/pixel_shuffle/metax/pixel_shuffle_metax.maca
@@ -1,9 +1,8 @@
-#include "pixel_shuffle_metax.h"
-#include "../cuda/kernel.cuh"
-#include "../../../utils.h"
+#include "../../../devices/metax/metax_common.h"
 #include "../../../devices/metax/metax_kernel_common.h"
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
+#include "../../../tensor.h"
+#include "../cuda/kernel.cuh"
+#include "pixel_shuffle_metax.h"
 
 namespace op::pixel_shuffle::metax {
 
diff --git a/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.h b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.h
index db1a6db4c..e47a3edb2 100644
--- a/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.h
+++ b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.h
@@ -2,7 +2,6 @@
 #define __PIXEL_SHUFFLE_MOORE_H__
 
 #include "../../../operator.h"
-#include "../../../devices/moore/moore_common.h"
 
 namespace op::pixel_shuffle::moore {
 
diff --git a/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.mu b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.mu
index 6fb6f9ef2..5d2c113d6 100644
--- a/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.mu
+++ b/src/infiniop/ops/pixel_shuffle/moore/pixel_shuffle_moore.mu
@@ -1,9 +1,8 @@
-#include "pixel_shuffle_moore.h"
-#include "../cuda/kernel.cuh"
-#include "../../../utils.h"
+#include "../../../devices/moore/moore_common.h"
 #include "../../../devices/moore/moore_kernel_common.h"
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
+#include "../../../tensor.h"
+#include "../cuda/kernel.cuh"
+#include "pixel_shuffle_moore.h"
 
 namespace op::pixel_shuffle::moore {
 
diff --git a/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cu b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cu
index 32b36e226..af2d006ae 100644
--- a/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cu
+++ b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cu
@@ -1,8 +1,8 @@
-#include "pixel_shuffle_nvidia.cuh"
+#include "../../../devices/nvidia/nvidia_common.cuh"
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
+#include "../../../tensor.h"
 #include "../cuda/kernel.cuh"
-#include "../../../utils.h"
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
+#include "pixel_shuffle_nvidia.cuh"
 #include <array>
 
 namespace op::pixel_shuffle::nvidia {
diff --git a/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cuh b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cuh
index 1cd155cab..a5c3b7506 100644
--- a/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cuh
+++ b/src/infiniop/ops/pixel_shuffle/nvidia/pixel_shuffle_nvidia.cuh
@@ -2,7 +2,6 @@
 #define __PIXEL_SHUFFLE_NVIDIA_H__
 
 #include "../../../operator.h"
-#include "../../../devices/nvidia/nvidia_common.cuh"
 #include <array>
 #include <cstddef>
 
diff --git a/src/infiniop/ops/pixel_shuffle/operator.cc b/src/infiniop/ops/pixel_shuffle/operator.cc
index 8147b0b2d..3092787ad 100644
--- a/src/infiniop/ops/pixel_shuffle/operator.cc
+++ b/src/infiniop/ops/pixel_shuffle/operator.cc
@@ -15,20 +15,20 @@
 #include "moore/pixel_shuffle_moore.h"
 #endif
 
-__C __export infiniStatus_t infiniopCreatePixelShuffleDescriptor(
+__INFINI_C __export infiniStatus_t infiniopCreatePixelShuffleDescriptor(
     infiniopHandle_t handle,
     infiniopPixelShuffleDescriptor_t *desc_ptr,
     infiniopTensorDescriptor_t y_desc,
     infiniopTensorDescriptor_t x_desc,
     int upscale_factor) {
 
-#define CREATE(CASE, NAMESPACE)                                                          \
-    case CASE:                                                                           \
-        return op::pixel_shuffle::NAMESPACE::Descriptor::create(                        \
-            handle,                                                                      \
-            reinterpret_cast<op::pixel_shuffle::NAMESPACE::Descriptor **>(desc_ptr),   \
-            y_desc,                                                                      \
-            x_desc,                                                                      \
+#define CREATE(CASE, NAMESPACE)                                                      \
+    case CASE:                                                                       \
+        return op::pixel_shuffle::NAMESPACE::Descriptor::create(                     \
+            handle,                                                                  \
+            reinterpret_cast<op::pixel_shuffle::NAMESPACE::Descriptor **>(desc_ptr), \
+            y_desc,                                                                  \
+            x_desc,                                                                  \
             upscale_factor)
 
     switch (handle->device) {
@@ -56,10 +56,10 @@ __C __export infiniStatus_t infiniopCreatePixelShuffleDescriptor(
 #undef CREATE
 }
 
-__C __export infiniStatus_t infiniopGetPixelShuffleWorkspaceSize(infiniopPixelShuffleDescriptor_t desc, size_t *size) {
+__INFINI_C __export infiniStatus_t infiniopGetPixelShuffleWorkspaceSize(infiniopPixelShuffleDescriptor_t desc, size_t *size) {
 
-#define GET(CASE, NAMESPACE)                                                                          \
-    case CASE:                                                                                        \
+#define GET(CASE, NAMESPACE)                                                                         \
+    case CASE:                                                                                       \
         *size = reinterpret_cast<op::pixel_shuffle::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
         return INFINI_STATUS_SUCCESS;
 
@@ -87,7 +87,7 @@ __C __export infiniStatus_t infiniopGetPixelShuffleWorkspaceSize(infiniopPixelSh
     return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
 }
 
-__C __export infiniStatus_t infiniopPixelShuffle(
+__INFINI_C __export infiniStatus_t infiniopPixelShuffle(
     infiniopPixelShuffleDescriptor_t desc,
     void *workspace,
     size_t workspace_size,
@@ -95,9 +95,9 @@ __C __export infiniStatus_t infiniopPixelShuffle(
     const void *x,
     void *stream) {
 
-#define CALCULATE(CASE, NAMESPACE)                                                          \
-    case CASE:                                                                              \
-        return reinterpret_cast<const op::pixel_shuffle::NAMESPACE::Descriptor *>(desc)   \
+#define CALCULATE(CASE, NAMESPACE)                                                      \
+    case CASE:                                                                          \
+        return reinterpret_cast<const op::pixel_shuffle::NAMESPACE::Descriptor *>(desc) \
             ->calculate(workspace, workspace_size, y, x, stream)
 
     switch (desc->device_type) {
@@ -125,12 +125,12 @@ __C __export infiniStatus_t infiniopPixelShuffle(
 #undef CALCULATE
 }
 
-__C __export infiniStatus_t
+__INFINI_C __export infiniStatus_t
 infiniopDestroyPixelShuffleDescriptor(infiniopPixelShuffleDescriptor_t desc) {
 
-#define DELETE(CASE, NAMESPACE)                                                              \
-    case CASE:                                                                               \
-        delete reinterpret_cast<const op::pixel_shuffle::NAMESPACE::Descriptor *>(desc);   \
+#define DELETE(CASE, NAMESPACE)                                                          \
+    case CASE:                                                                           \
+        delete reinterpret_cast<const op::pixel_shuffle::NAMESPACE::Descriptor *>(desc); \
         return INFINI_STATUS_SUCCESS;
 
     switch (desc->device_type) {
diff --git a/test/infiniop/erf.py b/test/infiniop/erf.py
new file mode 100644
index 000000000..733376b97
--- /dev/null
+++ b/test/infiniop/erf.py
@@ -0,0 +1,118 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import *
+from enum import Enum, auto
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+
+_TEST_CASES_ = [
+    ((13, 4), None, None),
+    ((13, 4), (10, 1), (10, 1)),
+    ((13, 4, 4), None, None),
+]
+
+_TEST_CASES = _TEST_CASES_
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
+
+_TOLERANCE_MAP = {
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4},
+}
+
+
+def test(
+    handle,
+    device,
+    shape,
+    input_stride=None,
+    output_stride=None,
+    dtype=torch.float16,
+    sync=None,
+):
+    print(
+        f"Testing Erf on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} "
+        f"dtype:{InfiniDtypeNames[dtype]}"
+    )
+    input = TestTensor(shape, input_stride, dtype, device)
+    output = TestTensor(shape, output_stride, dtype, device)
+
+    output.update_torch_tensor(torch.erf(input.torch_tensor()))
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateErfDescriptor(
+            handle, ctypes.byref(descriptor), output.descriptor, input.descriptor
+        )
+    )
+
+    input.destroy_desc()
+    output.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetErfWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, device)
+
+    def lib_f():
+        check_error(
+            LIBINFINIOP.infiniopErf(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                output.data(),
+                input.data(),
+                None,
+            )
+        )
+
+    lib_f()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol)
+    assert torch.allclose(
+        output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol
+    )
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: torch.erf(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_f(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyErfDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/erfc.py b/test/infiniop/erfc.py
new file mode 100644
index 000000000..b2424496d
--- /dev/null
+++ b/test/infiniop/erfc.py
@@ -0,0 +1,117 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import *
+from enum import Enum, auto
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+
+_TEST_CASES_ = [
+    ((13, 4), None, None),
+    ((13, 4), (10, 1), (10, 1)),
+    ((13, 4, 4), None, None),
+]
+
+_TEST_CASES = _TEST_CASES_
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
+
+_TOLERANCE_MAP = {
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4},
+}
+
+
+def test(
+    handle,
+    device,
+    shape,
+    input_stride=None,
+    output_stride=None,
+    dtype=torch.float16,
+    sync=None,
+):
+    print(
+        f"Testing Erfc on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} "
+        f"dtype:{InfiniDtypeNames[dtype]}"
+    )
+    input = TestTensor(shape, input_stride, dtype, device)
+    output = TestTensor(shape, output_stride, dtype, device)
+
+    output.update_torch_tensor(torch.erfc(input.torch_tensor()))
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateErfcDescriptor(
+            handle, ctypes.byref(descriptor), output.descriptor, input.descriptor
+        )
+    )
+
+    input.destroy_desc()
+    output.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetErfcWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, device)
+
+    def lib_f():
+        check_error(
+            LIBINFINIOP.infiniopErfc(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                output.data(),
+                input.data(),
+                None,
+            )
+        )
+
+    lib_f()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol)
+    assert torch.allclose(
+        output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol
+    )
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: torch.erfc(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_f(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyErfcDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/erfinv.py b/test/infiniop/erfinv.py
new file mode 100644
index 000000000..1979ee9c1
--- /dev/null
+++ b/test/infiniop/erfinv.py
@@ -0,0 +1,118 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import *
+from enum import Enum, auto
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+
+_TEST_CASES_ = [
+    ((13, 4), None, None),
+    ((13, 4), (10, 1), (10, 1)),
+    ((13, 4, 4), None, None),
+]
+
+_TEST_CASES = _TEST_CASES_
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
+
+_TOLERANCE_MAP = {
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4},
+}
+
+
+def test(
+    handle,
+    device,
+    shape,
+    input_stride=None,
+    output_stride=None,
+    dtype=torch.float16,
+    sync=None,
+):
+    print(
+        f"Testing Erfinv on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} "
+        f"dtype:{InfiniDtypeNames[dtype]}"
+    )
+    input = TestTensor(shape, input_stride, dtype, device, scale=1.8, bias=-0.9)
+
+    output = TestTensor(shape, output_stride, dtype, device)
+    output.update_torch_tensor(torch.erfinv(input.torch_tensor()))
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateErfinvDescriptor(
+            handle, ctypes.byref(descriptor), output.descriptor, input.descriptor
+        )
+    )
+
+    input.destroy_desc()
+    output.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetErfinvWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, device)
+
+    def lib_f():
+        check_error(
+            LIBINFINIOP.infiniopErfinv(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                output.data(),
+                input.data(),
+                None,
+            )
+        )
+
+    lib_f()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol)
+    assert torch.allclose(
+        output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol
+    )
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: torch.erfinv(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_f(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyErfinvDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py
index cd3ad1b82..3675ca8d1 100644
--- a/test/infiniop/libinfiniop/op_register.py
+++ b/test/infiniop/libinfiniop/op_register.py
@@ -1753,3 +1753,165 @@ def silu_and_mul(lib):
     lib.infiniopDestroySiluAndMulDescriptor.argtypes = [
         infiniopOperatorDescriptor_t,
     ]
+
+
+@OpRegister.operator
+def erf(lib):
+    lib.infiniopCreateErfDescriptor.restype = c_int32
+    lib.infiniopCreateErfDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+
+    lib.infiniopGetErfWorkspaceSize.restype = c_int32
+    lib.infiniopGetErfWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    lib.infiniopErf.restype = c_int32
+    lib.infiniopErf.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+
+    lib.infiniopDestroyErfDescriptor.restype = c_int32
+    lib.infiniopDestroyErfDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
+
+
+@OpRegister.operator
+def erfc(lib):
+    lib.infiniopCreateErfcDescriptor.restype = c_int32
+    lib.infiniopCreateErfcDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+
+    lib.infiniopGetErfcWorkspaceSize.restype = c_int32
+    lib.infiniopGetErfcWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    lib.infiniopErfc.restype = c_int32
+    lib.infiniopErfc.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+
+    lib.infiniopDestroyErfcDescriptor.restype = c_int32
+    lib.infiniopDestroyErfcDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
+
+
+@OpRegister.operator
+def erfinv(lib):
+    lib.infiniopCreateErfinvDescriptor.restype = c_int32
+    lib.infiniopCreateErfinvDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+
+    lib.infiniopGetErfinvWorkspaceSize.restype = c_int32
+    lib.infiniopGetErfinvWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    lib.infiniopErfinv.restype = c_int32
+    lib.infiniopErfinv.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+
+    lib.infiniopDestroyErfinvDescriptor.restype = c_int32
+    lib.infiniopDestroyErfinvDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
+
+
+@OpRegister.operator
+def matrix_power(lib):
+    lib.infiniopCreateMatrixPowerDescriptor.restype = c_int32
+    lib.infiniopCreateMatrixPowerDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        c_int32,
+    ]
+
+    lib.infiniopGetMatrixPowerWorkspaceSize.restype = c_int32
+    lib.infiniopGetMatrixPowerWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    lib.infiniopMatrixPower.restype = c_int32
+    lib.infiniopMatrixPower.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+
+    lib.infiniopDestroyMatrixPowerDescriptor.restype = c_int32
+    lib.infiniopDestroyMatrixPowerDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
+
+
+@OpRegister.operator
+def pixel_shuffle(lib):
+    lib.infiniopCreatePixelShuffleDescriptor.restype = c_int32
+    lib.infiniopCreatePixelShuffleDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        c_int32,
+    ]
+
+    lib.infiniopGetPixelShuffleWorkspaceSize.restype = c_int32
+    lib.infiniopGetPixelShuffleWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    lib.infiniopPixelShuffle.restype = c_int32
+    lib.infiniopPixelShuffle.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+
+    lib.infiniopDestroyPixelShuffleDescriptor.restype = c_int32
+    lib.infiniopDestroyPixelShuffleDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
diff --git a/test/infiniop/matrix_power.py b/test/infiniop/matrix_power.py
new file mode 100644
index 000000000..7527fe249
--- /dev/null
+++ b/test/infiniop/matrix_power.py
@@ -0,0 +1,104 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+
+_TEST_CASES_ = [
+    ((4, 4), None, None),
+    ((8, 8), None, None),
+]
+
+_POWERS = [2, 3, 4]
+
+_TEST_CASES = [test_case + (n,) for test_case in _TEST_CASES_ for n in _POWERS]
+
+_TENSOR_DTYPES = [InfiniDtype.F32]
+
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-2, "rtol": 1e-2},
+    InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4},
+}
+
+
+def test(handle, device, shape, input_stride, output_stride, n, dtype, sync=None):
+    print(
+        f"Testing MatrixPower on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} power:{n} "
+        f"dtype:{InfiniDtypeNames[dtype]}"
+    )
+    if shape[-1] != shape[-2]:
+        return
+
+    input = TestTensor(shape, input_stride, dtype, device)
+    output = TestTensor(shape, output_stride, dtype, device)
+
+    output.update_torch_tensor(torch.linalg.matrix_power(input.torch_tensor(), n))
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateMatrixPowerDescriptor(
+            handle, ctypes.byref(descriptor), output.descriptor, input.descriptor, n
+        )
+    )
+
+    input.destroy_desc()
+    output.destroy_desc()
+
+    ws = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetMatrixPowerWorkspaceSize(descriptor, ctypes.byref(ws))
+    )
+    workspace = TestWorkspace(ws.value, device)
+
+    def lib_f():
+        check_error(
+            LIBINFINIOP.infiniopMatrixPower(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                output.data(),
+                input.data(),
+                None,
+            )
+        )
+
+    lib_f()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    assert torch.allclose(
+        output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol
+    )
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: torch.linalg.matrix_power(input.torch_tensor(), n), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_f(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyMatrixPowerDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/pixel_shuffle.py b/test/infiniop/pixel_shuffle.py
new file mode 100644
index 000000000..2fb690fe5
--- /dev/null
+++ b/test/infiniop/pixel_shuffle.py
@@ -0,0 +1,114 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+
+
+_TEST_CASES = [
+    ((1, 4, 8, 8), None, None, 2),
+    ((2, 16, 4, 4), None, None, 2),
+    ((2, 16, 4, 4), None, None, 4),
+]
+
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32]
+
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4},
+}
+
+
+def test(handle, device, shape, input_stride, output_stride, upscale, dtype, sync=None):
+    print(
+        f"Testing PixelShuffle on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} upscale:{upscale} "
+        f"dtype:{InfiniDtypeNames[dtype]}"
+    )
+    r = upscale
+    assert shape[1] % (r * r) == 0
+
+    input = TestTensor(shape, input_stride, dtype, device)
+
+    output_shape = (
+        shape[0],
+        shape[1] // (r * r),
+        shape[2] * r,
+        shape[3] * r,
+    )
+
+    output = TestTensor(output_shape, output_stride, dtype, device)
+
+    output.update_torch_tensor(
+        torch.nn.functional.pixel_shuffle(input.torch_tensor(), r)
+    )
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreatePixelShuffleDescriptor(
+            handle, ctypes.byref(descriptor), output.descriptor, input.descriptor, r
+        )
+    )
+
+    input.destroy_desc()
+    output.destroy_desc()
+
+    ws = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetPixelShuffleWorkspaceSize(descriptor, ctypes.byref(ws))
+    )
+    workspace = TestWorkspace(ws.value, device)
+
+    def lib_f():
+        check_error(
+            LIBINFINIOP.infiniopPixelShuffle(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                output.data(),
+                input.data(),
+                None,
+            )
+        )
+
+    lib_f()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol)
+    assert torch.allclose(
+        output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol
+    )
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: torch.nn.functional.pixel_shuffle(input.torch_tensor(), r), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_f(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyPixelShuffleDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+    print("\033[92mTest passed!\033[0m")