-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathkernel_commons.h
More file actions
116 lines (103 loc) · 3.02 KB
/
kernel_commons.h
File metadata and controls
116 lines (103 loc) · 3.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#ifndef INFINI_OPS_COMMON_CUDA_KERNEL_COMMONS_H_
#define INFINI_OPS_COMMON_CUDA_KERNEL_COMMONS_H_
#ifdef WITH_NVIDIA
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
using cuda_bfloat16 = nv_bfloat16;
using cuda_bfloat162 = nv_bfloat162;
#elif defined(WITH_ILUVATAR)
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
using cuda_bfloat16 = nv_bfloat16;
using cuda_bfloat162 = nv_bfloat162;
#elif defined(WITH_METAX)
#include <mcr/mc_runtime.h>
using cuda_bfloat16 = maca_bfloat16;
using cuda_bfloat162 = maca_bfloat162;
#elif defined(WITH_MOORE)
#include <musa_bf16.h>
#include <musa_fp16.h>
#include <musa_runtime.h>
using cuda_bfloat16 = __mt_bfloat16;
using cuda_bfloat162 = __mt_bfloat162;
#endif
#include <cstdlib>
#include <iostream>
#include <vector>
#include "caster.h"
namespace infini::ops {
// Compile-time list of candidate thread-block sizes; matches the set of
// values `GetOptimalBlockSize` can return (128..2048).
// NOTE(review): `List` presumably comes from "caster.h" — confirm.
using AllCudaBlockSizes = List<128, 256, 512, 1024, 2048>;
#if defined(WITH_NVIDIA) || defined(WITH_ILUVATAR)
// Cache `cudaDeviceProp` per device, initialized once at first access.
// Caches `cudaDeviceProp` per device. The cache is built exactly once at
// first access (thread-safe via C++11 magic statics) and kept for the
// lifetime of the process.
class DevicePropertyCache {
 public:
  // Returns cached properties of the currently active CUDA device.
  // Aborts if the runtime cannot report the current device.
  static const cudaDeviceProp& GetCurrentDeviceProps() {
    int device_id = 0;
    cudaError_t err = cudaGetDevice(&device_id);
    if (err != cudaSuccess) {
      std::cerr << "error: `cudaGetDevice` failed in `GetCurrentDeviceProps`: "
                << cudaGetErrorString(err) << "\n";
      std::abort();
    }
    return GetDeviceProps(device_id);
  }

  // Returns cached properties of `device_id`.
  // Aborts when `device_id` is out of range or a property query failed
  // while building the cache (a silently-ignored failure would leave
  // uninitialized `cudaDeviceProp` entries).
  static const cudaDeviceProp& GetDeviceProps(int device_id) {
    static std::vector<cudaDeviceProp> cache = []() {
      int count = 0;
      // A failed count query is treated as "no devices"; the range check
      // below then reports the problem for any lookup.
      if (cudaGetDeviceCount(&count) != cudaSuccess || count <= 0) {
        return std::vector<cudaDeviceProp>{};
      }
      std::vector<cudaDeviceProp> props(count);
      for (int i = 0; i < count; ++i) {
        cudaError_t err = cudaGetDeviceProperties(&props[i], i);
        if (err != cudaSuccess) {
          std::cerr << "error: `cudaGetDeviceProperties` failed for device "
                    << i << ": " << cudaGetErrorString(err) << "\n";
          std::abort();
        }
      }
      return props;
    }();
    if (device_id < 0 || device_id >= static_cast<int>(cache.size())) {
      std::cerr << "error: `device_id` " << device_id << " is out of range [0, "
                << cache.size() << ") in `GetDeviceProps`\n";
      std::abort();
    }
    return cache[device_id];
  }
};
inline int QueryMaxThreadsPerBlock() {
return DevicePropertyCache::GetCurrentDeviceProps().maxThreadsPerBlock;
}
#elif defined(WITH_METAX)
// Maximum threads per block for Metax devices.
// TODO: Add MCR device properties query for Metax.
inline int QueryMaxThreadsPerBlock() {
  // Conservative placeholder until the real runtime query is wired up.
  constexpr int kDefaultMaxThreadsPerBlock = 256;
  return kDefaultMaxThreadsPerBlock;
}
#elif defined(WITH_MOORE)
inline int QueryMaxThreadsPerBlock() {
int device = 0;
musaGetDevice(&device);
musaDeviceProp prop;
musaGetDeviceProperties(&prop, device);
return prop.maxThreadsPerBlock;
}
#endif
// Get optimal block size based on GPU hardware architecture.
// Picks the largest block size from {2048, 1024, 512, 256, 128} that does
// not exceed the device's `maxThreadsPerBlock` limit.
inline int GetOptimalBlockSize() {
  const int limit = QueryMaxThreadsPerBlock();
  static const int kCandidates[] = {2048, 1024, 512, 256};
  for (int candidate : kCandidates) {
    if (limit >= candidate) {
      return candidate;
    }
  }
  return 128;  // floor for devices reporting fewer than 256 threads
}
// Converts a flat (row-major) element index into a strided memory offset.
// Walks the `ndim` axes from innermost (ndim - 1) to outermost (0), peeling
// off the coordinate along each axis and accumulating coordinate * stride.
// With ndim == 0 the result is 0.
__forceinline__ __device__ __host__ size_t
IndexToOffset(size_t flat_index, size_t ndim, const size_t* shape,
              const ptrdiff_t* strides) {
  size_t offset = 0;
  size_t remaining = flat_index;
  size_t axis = ndim;
  while (axis > 0) {
    --axis;
    const size_t coord = remaining % shape[axis];
    offset += coord * strides[axis];
    remaining /= shape[axis];
  }
  return offset;
}
} // namespace infini::ops
#endif