Skip to content

Commit ed61b51

Browse files
authored
Merge pull request #1106 from InfiniTensor/issue/1105
issue/1105 - fix hpcc compilation
2 parents f44330d + 3dfb950 commit ed61b51

31 files changed

Lines changed: 95 additions & 174 deletions

File tree

src/infiniop/devices/metax/metax_common.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,11 @@
66
#ifdef ENABLE_METAX_MC_API
77
#include <mcblas/mcblas.h>
88
#include <mcdnn/mcdnn.h>
9+
#include <mcr/mc_runtime.h>
910
#else
1011
#include <hcblas/hcblas.h>
1112
#include <hcdnn/hcdnn.h>
13+
#include <hcr/hc_runtime.h>
1214
#endif
1315
#include <functional>
1416
#include <memory>

src/infiniop/devices/metax/metax_ht2mc.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,4 +101,6 @@
101101
#define hcGraphDestroy mcGraphDestroy
102102
#define hcGraphExecDestroy mcGraphExecDestroy
103103
#define hcGraphLaunch mcGraphLaunch
104+
#define hcMemsetAsync mcMemsetAsync
105+
#define hcGetLastError mcGetLastError
104106
#endif

src/infiniop/devices/metax/metax_kernel_common.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
#define INFINIOP_METAX_KERNEL __global__ void
22

33
#ifdef ENABLE_METAX_MC_API
4+
#include <maca_bfloat16.h>
5+
#include <maca_fp16.h>
46
#include <maca_fp8.h>
57
#else
8+
#include <hpcc_bfloat16.h>
9+
#include <hpcc_fp16.h>
610
#include <hpcc_fp8.h>
711
#endif
812

src/infiniop/ops/addcmul/cuda/kernel.cuh

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
#ifndef __ADDCMUL_CUDA_CUH__
22
#define __ADDCMUL_CUDA_CUH__
33

4-
#include <cuda_bf16.h>
5-
#include <cuda_fp16.h>
64
#include <type_traits>
75

86
namespace op::addcmul::cuda {

src/infiniop/ops/addcmul/metax/addcmul_metax.maca

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
#include "../../../devices/metax/metax_handle.h"
12
#include "../../../elementwise/metax/elementwise_metax.h"
23

34
#include "addcmul_metax.h"
@@ -129,7 +130,7 @@ static inline infiniStatus_t launch_addcmul_kernel(
129130
auto *t1_ptr = reinterpret_cast<const T *>(inputs.at(1));
130131
auto *t2_ptr = reinterpret_cast<const T *>(inputs.at(2));
131132

132-
mcStream_t metax_stream = reinterpret_cast<mcStream_t>(stream);
133+
hcStream_t metax_stream = reinterpret_cast<hcStream_t>(stream);
133134

134135
constexpr uint32_t BLOCK_SIZE = 256;
135136
uint32_t grid = static_cast<uint32_t>((output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
@@ -146,7 +147,7 @@ static inline infiniStatus_t launch_addcmul_kernel(
146147
t2_ptr,
147148
desc->getValue());
148149

149-
CHECK_METAX(mcGetLastError());
150+
CHECK_METAX(hcGetLastError());
150151
return INFINI_STATUS_SUCCESS;
151152
}
152153

src/infiniop/ops/addr/cpu/addr_cpu.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#include "addr_cpu.h"
22
#include "../../../devices/cpu/common_cpu.h"
3-
#include <spdlog/spdlog.h>
3+
44
namespace op::addr::cpu {
55
Descriptor::~Descriptor() = default;
66

src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -37,27 +37,36 @@ infiniStatus_t calculateArgWhere(
3737
const void *x) {
3838

3939
const Tdata *x_data = reinterpret_cast<const Tdata *>(x);
40-
// int64_t *y_data = reinterpret_cast<int64_t *>(y);
41-
std::vector<size_t> positions;
42-
// #pragma omp parallel for
40+
41+
std::vector<int64_t> positions;
42+
const size_t ndim = info.shapes.size();
43+
4344
for (size_t i = 0; i < info.num_elements; i++) {
44-
size_t pos = 0, tem = i;
45-
std::vector<size_t> position(info.strides.size());
46-
for (size_t j = info.strides.size() - 1; j >= 0; j--) {
47-
position[j] = tem % info.shapes[j];
48-
tem /= info.shapes[j];
49-
pos += position[j] * info.strides[j];
45+
size_t pos = 0;
46+
size_t tmp = i;
47+
48+
std::vector<int64_t> coord(ndim);
49+
50+
// unravel index
51+
for (size_t j = ndim; j-- > 0;) {
52+
coord[j] = tmp % info.shapes[j];
53+
tmp /= info.shapes[j];
54+
pos += coord[j] * info.strides[j];
5055
}
51-
if (fabs(x_data[pos] - 0.0f) > 1e-5) {
52-
for (auto p : position) {
53-
positions.push_back(p);
56+
57+
// PyTorch semantics: != 0
58+
if (x_data[pos] != Tdata(0)) {
59+
for (size_t j = 0; j < ndim; j++) {
60+
positions.push_back(coord[j]);
5461
}
5562
}
5663
}
5764

65+
*count = positions.size() / ndim;
66+
5867
*y = new int64_t[positions.size()];
5968
memcpy(*y, positions.data(), positions.size() * sizeof(int64_t));
60-
*count = positions.size() / info.strides.size();
69+
6170
return INFINI_STATUS_SUCCESS;
6271
}
6372

src/infiniop/ops/argwhere/moore/argwhere_moore.mu

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,6 @@
33
#include "argwhere_kernel.h"
44
#include "argwhere_moore.h"
55
#include "infinicore.h"
6-
#include <spdlog/spdlog.h>
7-
8-
// template <typename T>
9-
// INFINIOP_MOORE_KERNEL parallel_block_argwhere(T *data, int64_t *results, size_t N,
10-
// size_t M, const size_t *shapes,
11-
// const ptrdiff_t *strides, size_t ndim,
12-
// size_t *count) {
13-
// parallel_block_argwhere_kernel<float><<<1, M / 2, M>>>(
14-
// data, results, N, shapes, strides, ndim, count);
15-
// }
166

177
infiniStatus_t launchKernel(const void *data, int64_t *results, size_t N,
188
size_t M, const size_t *shapes,
@@ -90,12 +80,6 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
9080
musaMemcpyAsync(*y, result, sizeof(int64_t) * (*count) * ndim,
9181
musaMemcpyDeviceToHost, moore_stream);
9282

93-
// cudaStreamSynchronize(cuda_stream);
94-
// for (size_t i = 0; i < (*count) * ndim; i++) {
95-
// spdlog::debug("(*y)[{}]:{}", i, static_cast<size_t *>(*y)[i]);
96-
// }
97-
// cudaFreeAsync(result, cuda_stream);
98-
// cudaFreeAsync(count_cuda, cuda_stream);
9983
return INFINI_STATUS_SUCCESS;
10084
}
10185

src/infiniop/ops/atanh/cuda/kernel.cuh

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
#ifndef __ATANH_CUDA_H__
22
#define __ATANH_CUDA_H__
33

4-
#include <cuda_bf16.h>
5-
#include <cuda_fp16.h>
6-
74
namespace op::atanh::cuda {
85
typedef struct AtanhOp {
96
public:

src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.maca

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
#include "../../../devices/metax/metax_common.h"
22
#include "../../../devices/metax/metax_handle.h"
33
#include "../../../devices/metax/metax_kernel_common.h"
4+
45
#include "binary_cross_entropy_with_logits_metax.h"
5-
#include <mc_runtime.h>
6+
67
#include <type_traits>
78

89
namespace op::bce_with_logits::metax {
@@ -191,7 +192,7 @@ infiniStatus_t Descriptor::calculate(
191192
const void *pos_weight,
192193
void *stream) const {
193194

194-
mcStream_t custream = (mcStream_t)stream;
195+
hcStream_t custream = (hcStream_t)stream;
195196
size_t n = _info.num_elements;
196197

197198
// F16/BF16 + 归约需要 float workspace
@@ -219,7 +220,7 @@ infiniStatus_t Descriptor::calculate(
219220
case INFINI_DTYPE_F32: {
220221
// 如果是规约操作,计算前需将输出位置清零
221222
if (_reduction != INFINIOP_REDUCTION_NONE) {
222-
mcMemsetAsync(out, 0, sizeof(float), custream);
223+
hcMemsetAsync(out, 0, sizeof(float), custream);
223224
}
224225

225226
bce_logits_kernel<float, float><<<grid, block, 0, custream>>>(
@@ -255,7 +256,7 @@ infiniStatus_t Descriptor::calculate(
255256
out_raw = out;
256257
} else {
257258
workspace_f = static_cast<float *>(workspace);
258-
mcMemsetAsync(workspace_f, 0, sizeof(float), custream);
259+
hcMemsetAsync(workspace_f, 0, sizeof(float), custream);
259260
out_raw = workspace_f;
260261
}
261262

@@ -294,7 +295,7 @@ infiniStatus_t Descriptor::calculate(
294295
out_raw = out;
295296
} else {
296297
workspace_f = static_cast<float *>(workspace);
297-
mcMemsetAsync(workspace_f, 0, sizeof(float), custream);
298+
hcMemsetAsync(workspace_f, 0, sizeof(float), custream);
298299
out_raw = workspace_f;
299300
}
300301

@@ -324,8 +325,8 @@ infiniStatus_t Descriptor::calculate(
324325
return INFINI_STATUS_BAD_TENSOR_DTYPE;
325326
}
326327

327-
mcError_t err = mcGetLastError();
328-
if (err != mcSuccess) {
328+
hcError_t err = hcGetLastError();
329+
if (err != hcSuccess) {
329330
return INFINI_STATUS_INTERNAL_ERROR;
330331
}
331332
return INFINI_STATUS_SUCCESS;

0 commit comments

Comments
 (0)