InfiniTensor
diff --git a/‎src/infiniop/devices/metax/metax_common.h‎
Lines changed: 2 additions & 0 deletions b/‎src/infiniop/devices/metax/metax_common.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/infiniop/devices/metax/metax_ht2mc.h‎
Lines changed: 2 additions & 0 deletions b/‎src/infiniop/devices/metax/metax_ht2mc.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/infiniop/devices/metax/metax_kernel_common.h‎
Lines changed: 4 additions & 0 deletions b/‎src/infiniop/devices/metax/metax_kernel_common.h‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎src/infiniop/ops/addcmul/cuda/kernel.cuh‎
Lines changed: 0 additions & 2 deletions b/‎src/infiniop/ops/addcmul/cuda/kernel.cuh‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎src/infiniop/ops/addcmul/metax/addcmul_metax.maca‎
Lines changed: 3 additions & 2 deletions b/‎src/infiniop/ops/addcmul/metax/addcmul_metax.maca‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎src/infiniop/ops/addr/cpu/addr_cpu.cc‎
Lines changed: 1 addition & 1 deletion b/‎src/infiniop/ops/addr/cpu/addr_cpu.cc‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc‎
Lines changed: 22 additions & 13 deletions b/‎src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc‎
Lines changed: 22 additions & 13 deletions
diff --git a/‎src/infiniop/ops/argwhere/moore/argwhere_moore.mu‎
Lines changed: 0 additions & 16 deletions b/‎src/infiniop/ops/argwhere/moore/argwhere_moore.mu‎
Lines changed: 0 additions & 16 deletions
diff --git a/‎src/infiniop/ops/atanh/cuda/kernel.cuh‎
Lines changed: 0 additions & 3 deletions b/‎src/infiniop/ops/atanh/cuda/kernel.cuh‎
Lines changed: 0 additions & 3 deletions
diff --git a/‎src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.maca‎
Lines changed: 8 additions & 7 deletions b/‎src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.maca‎
Lines changed: 8 additions & 7 deletions
@@ -6,9 +6,11 @@
 #ifdef ENABLE_METAX_MC_API
 #include <mcblas/mcblas.h>
 #include <mcdnn/mcdnn.h>
+#include <mcr/mc_runtime.h>
 #else
 #include <hcblas/hcblas.h>
 #include <hcdnn/hcdnn.h>
+#include <hcr/hc_runtime.h>
 #endif
 #include <functional>
 #include <memory>
 
@@ -101,4 +101,6 @@
 #define hcGraphDestroy mcGraphDestroy
 #define hcGraphExecDestroy mcGraphExecDestroy
 #define hcGraphLaunch mcGraphLaunch
+#define hcMemsetAsync mcMemsetAsync
+#define hcGetLastError mcGetLastError
 #endif
@@ -1,8 +1,12 @@
 #define INFINIOP_METAX_KERNEL __global__ void
 
 #ifdef ENABLE_METAX_MC_API
+#include <maca_bfloat16.h>
+#include <maca_fp16.h>
 #include <maca_fp8.h>
 #else
+#include <hpcc_bfloat16.h>
+#include <hpcc_fp16.h>
 #include <hpcc_fp8.h>
 #endif
 
 
@@ -1,8 +1,6 @@
 #ifndef __ADDCMUL_CUDA_CUH__
 #define __ADDCMUL_CUDA_CUH__
 
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
 #include <type_traits>
 
 namespace op::addcmul::cuda {
 
@@ -1,3 +1,4 @@
+#include "../../../devices/metax/metax_handle.h"
 #include "../../../elementwise/metax/elementwise_metax.h"
 
 #include "addcmul_metax.h"
@@ -129,7 +130,7 @@ static inline infiniStatus_t launch_addcmul_kernel(
     auto *t1_ptr = reinterpret_cast<const T *>(inputs.at(1));
     auto *t2_ptr = reinterpret_cast<const T *>(inputs.at(2));
 
-    mcStream_t metax_stream = reinterpret_cast<mcStream_t>(stream);
+    hcStream_t metax_stream = reinterpret_cast<hcStream_t>(stream);
 
     constexpr uint32_t BLOCK_SIZE = 256;
     uint32_t grid = static_cast<uint32_t>((output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
@@ -146,7 +147,7 @@ static inline infiniStatus_t launch_addcmul_kernel(
         t2_ptr,
         desc->getValue());
 
-    CHECK_METAX(mcGetLastError());
+    CHECK_METAX(hcGetLastError());
     return INFINI_STATUS_SUCCESS;
 }
 
 
@@ -1,6 +1,6 @@
 #include "addr_cpu.h"
 #include "../../../devices/cpu/common_cpu.h"
-#include <spdlog/spdlog.h>
+
 namespace op::addr::cpu {
 Descriptor::~Descriptor() = default;
 
 
@@ -37,27 +37,36 @@ infiniStatus_t calculateArgWhere(
     const void *x) {
 
     const Tdata *x_data = reinterpret_cast<const Tdata *>(x);
-    // int64_t *y_data = reinterpret_cast<int64_t *>(y);
-    std::vector<size_t> positions;
-    // #pragma omp parallel for
+
+    std::vector<int64_t> positions;
+    const size_t ndim = info.shapes.size();
+
     for (size_t i = 0; i < info.num_elements; i++) {
-        size_t pos = 0, tem = i;
-        std::vector<size_t> position(info.strides.size());
-        for (size_t j = info.strides.size() - 1; j >= 0; j--) {
-            position[j] = tem % info.shapes[j];
-            tem /= info.shapes[j];
-            pos += position[j] * info.strides[j];
+        size_t pos = 0;
+        size_t tmp = i;
+
+        std::vector<int64_t> coord(ndim);
+
+        // unravel index
+        for (size_t j = ndim; j-- > 0;) {
+            coord[j] = tmp % info.shapes[j];
+            tmp /= info.shapes[j];
+            pos += coord[j] * info.strides[j];
         }
-        if (fabs(x_data[pos] - 0.0f) > 1e-5) {
-            for (auto p : position) {
-                positions.push_back(p);
+
+        // PyTorch semantics: != 0
+        if (x_data[pos] != Tdata(0)) {
+            for (size_t j = 0; j < ndim; j++) {
+                positions.push_back(coord[j]);
             }
         }
     }
 
+    *count = positions.size() / ndim;
+
     *y = new int64_t[positions.size()];
     memcpy(*y, positions.data(), positions.size() * sizeof(int64_t));
-    *count = positions.size() / info.strides.size();
+
     return INFINI_STATUS_SUCCESS;
 }
 
 
@@ -3,16 +3,6 @@
 #include "argwhere_kernel.h"
 #include "argwhere_moore.h"
 #include "infinicore.h"
-#include <spdlog/spdlog.h>
-
-// template <typename T>
-// INFINIOP_MOORE_KERNEL parallel_block_argwhere(T *data, int64_t *results, size_t N,
-//                             size_t M, const size_t *shapes,
-//                             const ptrdiff_t *strides, size_t ndim,
-//                             size_t *count) {
-//        parallel_block_argwhere_kernel<float><<<1, M / 2, M>>>(
-//         data, results, N, shapes, strides, ndim, count);
-// }
 
 infiniStatus_t launchKernel(const void *data, int64_t *results, size_t N,
                             size_t M, const size_t *shapes,
@@ -90,12 +80,6 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
     musaMemcpyAsync(*y, result, sizeof(int64_t) * (*count) * ndim,
                     musaMemcpyDeviceToHost, moore_stream);
 
-    // cudaStreamSynchronize(cuda_stream);
-    // for (size_t i = 0; i < (*count) * ndim; i++) {
-    //   spdlog::debug("(*y)[{}]:{}", i, static_cast<size_t *>(*y)[i]);
-    // }
-    // cudaFreeAsync(result, cuda_stream);
-    // cudaFreeAsync(count_cuda, cuda_stream);
     return INFINI_STATUS_SUCCESS;
 }
 
 
@@ -1,9 +1,6 @@
 #ifndef __ATANH_CUDA_H__
 #define __ATANH_CUDA_H__
 
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-
 namespace op::atanh::cuda {
 typedef struct AtanhOp {
 public:
 
@@ -1,8 +1,9 @@
 #include "../../../devices/metax/metax_common.h"
 #include "../../../devices/metax/metax_handle.h"
 #include "../../../devices/metax/metax_kernel_common.h"
+
 #include "binary_cross_entropy_with_logits_metax.h"
-#include <mc_runtime.h>
+
 #include <type_traits>
 
 namespace op::bce_with_logits::metax {
@@ -191,7 +192,7 @@ infiniStatus_t Descriptor::calculate(
     const void *pos_weight,
     void *stream) const {
 
-    mcStream_t custream = (mcStream_t)stream;
+    hcStream_t custream = (hcStream_t)stream;
     size_t n = _info.num_elements;
 
     // F16/BF16 + 归约需要 float workspace
@@ -219,7 +220,7 @@ infiniStatus_t Descriptor::calculate(
     case INFINI_DTYPE_F32: {
         // 如果是规约操作，计算前需将输出位置清零
         if (_reduction != INFINIOP_REDUCTION_NONE) {
-            mcMemsetAsync(out, 0, sizeof(float), custream);
+            hcMemsetAsync(out, 0, sizeof(float), custream);
         }
 
         bce_logits_kernel<float, float><<<grid, block, 0, custream>>>(
@@ -255,7 +256,7 @@ infiniStatus_t Descriptor::calculate(
             out_raw = out;
         } else {
             workspace_f = static_cast<float *>(workspace);
-            mcMemsetAsync(workspace_f, 0, sizeof(float), custream);
+            hcMemsetAsync(workspace_f, 0, sizeof(float), custream);
             out_raw = workspace_f;
         }
 
@@ -294,7 +295,7 @@ infiniStatus_t Descriptor::calculate(
             out_raw = out;
         } else {
             workspace_f = static_cast<float *>(workspace);
-            mcMemsetAsync(workspace_f, 0, sizeof(float), custream);
+            hcMemsetAsync(workspace_f, 0, sizeof(float), custream);
             out_raw = workspace_f;
         }
 
@@ -324,8 +325,8 @@ infiniStatus_t Descriptor::calculate(
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }
 
-    mcError_t err = mcGetLastError();
-    if (err != mcSuccess) {
+    hcError_t err = hcGetLastError();
+    if (err != hcSuccess) {
         return INFINI_STATUS_INTERNAL_ERROR;
     }
     return INFINI_STATUS_SUCCESS;