Skip to content

Commit 5e5098b

Browse files
committed
feat(hygon-add): add Hygon backend support for Add
- Add `WITH_HYGON` build support and a Hygon `Add` backend that reuses the shared CUDA implementation. - Detect DTK `nvcc` from the Hygon toolkit layout and auto-detect the GPU arch from `rocminfo`. - Treat Hygon as a CUDA-like backend in shared data type, cast, and kernel helper headers. - Skip the Hygon `gemm` example for now and ignore `build-*` temporary directories. - Verified with `pip install -e .[dev]` and `pytest tests/test_add.py`.
1 parent 2816b58 commit 5e5098b

8 files changed

Lines changed: 232 additions & 4 deletions

File tree

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# Generated files
22
build/
3+
build-*/
4+
cmake-build-*/
35
generated/
46

57
# Prerequisites

CMakeLists.txt

Lines changed: 71 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,16 @@ set(PYBIND11_ENABLE_EXTRAS ON)
1111
option(WITH_CPU "Enable CPU backend" OFF)
1212
option(WITH_NVIDIA "Enable CUDA backend" OFF)
1313
option(WITH_ILUVATAR "Enable Iluvatar GPU backend" OFF)
14+
option(WITH_HYGON "Enable Hygon GPU backend" OFF)
1415
option(WITH_METAX "Enable MetaX backend" OFF)
1516
option(WITH_CAMBRICON "Enable Cambricon backend" OFF)
1617
option(WITH_MOORE "Enable Moore backend" OFF)
1718

1819
option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF)
1920
option(GENERATE_PYTHON_BINDINGS "Generate Python bindings" OFF)
2021

22+
set(_DEFAULT_HYGON_DTK_ROOT "/opt/dtk")
23+
2124
if(AUTO_DETECT_DEVICES)
2225
message(STATUS "Auto-detecting available devices...")
2326

@@ -37,6 +40,13 @@ if(AUTO_DETECT_DEVICES)
3740
message(STATUS "Auto-detected Iluvatar environment.")
3841
endif()
3942

43+
if(DEFINED ENV{DTK_ROOT} OR
44+
EXISTS "${_DEFAULT_HYGON_DTK_ROOT}/cuda/bin/nvcc" OR
45+
EXISTS "${_DEFAULT_HYGON_DTK_ROOT}/cuda/cuda/bin/nvcc")
46+
set(WITH_HYGON ON)
47+
message(STATUS "Auto-detected Hygon environment.")
48+
endif()
49+
4050
if(DEFINED ENV{MACA_PATH})
4151
set(WITH_METAX ON)
4252
message(STATUS "Auto-detected MetaX environment from MACA_PATH")
@@ -77,14 +87,14 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)
7787

7888
# Only one CUDA-like GPU backend can be enabled at a time.
7989
set(_gpu_backend_count 0)
80-
foreach(_gpu_backend WITH_NVIDIA WITH_ILUVATAR WITH_METAX WITH_MOORE)
90+
foreach(_gpu_backend WITH_NVIDIA WITH_ILUVATAR WITH_HYGON WITH_METAX WITH_MOORE)
8191
if(${_gpu_backend})
8292
math(EXPR _gpu_backend_count "${_gpu_backend_count} + 1")
8393
endif()
8494
endforeach()
8595

8696
if(_gpu_backend_count GREATER 1)
87-
message(FATAL_ERROR "`WITH_NVIDIA`, `WITH_ILUVATAR`, `WITH_METAX`, and `WITH_MOORE` are mutually exclusive. Build one GPU backend at a time.")
97+
message(FATAL_ERROR "`WITH_NVIDIA`, `WITH_ILUVATAR`, `WITH_HYGON`, `WITH_METAX`, and `WITH_MOORE` are mutually exclusive. Build one GPU backend at a time.")
8898
endif()
8999

90100
if(WITH_NVIDIA)
@@ -111,6 +121,64 @@ if(WITH_ILUVATAR)
111121
find_package(CUDAToolkit REQUIRED)
112122
endif()
113123

124+
if(WITH_HYGON)
125+
add_compile_definitions(WITH_HYGON=1)
126+
set(DTK_ROOT $ENV{DTK_ROOT})
127+
if(NOT DTK_ROOT)
128+
set(DTK_ROOT "${_DEFAULT_HYGON_DTK_ROOT}")
129+
endif()
130+
if(NOT EXISTS "${DTK_ROOT}")
131+
message(FATAL_ERROR "`WITH_HYGON` is `ON` but `DTK_ROOT` (`${DTK_ROOT}`) does not exist.")
132+
endif()
133+
134+
set(_HYGON_ARCH_DEFAULT "gfx906")
135+
if(DEFINED ENV{HYGON_ARCH} AND NOT "$ENV{HYGON_ARCH}" STREQUAL "")
136+
set(_HYGON_ARCH_DEFAULT "$ENV{HYGON_ARCH}")
137+
else()
138+
find_program(HYGON_ROCMINFO_EXECUTABLE NAMES rocminfo HINTS "${DTK_ROOT}/bin")
139+
if(HYGON_ROCMINFO_EXECUTABLE)
140+
execute_process(
141+
COMMAND ${HYGON_ROCMINFO_EXECUTABLE}
142+
OUTPUT_VARIABLE _HYGON_ROCMINFO_OUTPUT
143+
ERROR_QUIET
144+
OUTPUT_STRIP_TRAILING_WHITESPACE
145+
)
146+
string(REGEX MATCH "gfx[0-9]+" _HYGON_ARCH_AUTO "${_HYGON_ROCMINFO_OUTPUT}")
147+
if(_HYGON_ARCH_AUTO)
148+
set(_HYGON_ARCH_DEFAULT "${_HYGON_ARCH_AUTO}")
149+
endif()
150+
endif()
151+
endif()
152+
153+
set(HYGON_ARCH "${_HYGON_ARCH_DEFAULT}" CACHE STRING "Hygon GPU architecture")
154+
set(HYGON_CUDA_ROOT "${DTK_ROOT}/cuda")
155+
if(EXISTS "${DTK_ROOT}/cuda/cuda/bin/nvcc")
156+
set(HYGON_CUDA_ROOT "${DTK_ROOT}/cuda/cuda")
157+
endif()
158+
159+
if(NOT EXISTS "${HYGON_CUDA_ROOT}/bin/nvcc")
160+
message(FATAL_ERROR "`WITH_HYGON` is `ON` but `${HYGON_CUDA_ROOT}/bin/nvcc` was not found. Checked `${DTK_ROOT}/cuda/bin/nvcc` and `${DTK_ROOT}/cuda/cuda/bin/nvcc`.")
161+
endif()
162+
163+
set(CMAKE_CUDA_COMPILER "${HYGON_CUDA_ROOT}/bin/nvcc" CACHE FILEPATH "Hygon CUDA compiler (DTK nvcc)")
164+
set(CUDAToolkit_ROOT "${HYGON_CUDA_ROOT}" CACHE PATH "Hygon CUDA toolkit root")
165+
set(CMAKE_CUDA_ARCHITECTURES OFF CACHE STRING "Disable default CUDA arch flags for Hygon" FORCE)
166+
set(CMAKE_CUDA_FLAGS "-std=c++17 -fPIC -arch=${HYGON_ARCH} -Wno-return-type -Wno-error=unused-private-field" CACHE STRING "Hygon CUDA flags")
167+
set(CMAKE_CUDA_SEPARABLE_COMPILATION OFF CACHE BOOL "Disable RDC for Hygon")
168+
169+
# DTK's nvcc wrapper may invoke `nvcc` by name during compiler checks.
170+
set(ENV{PATH} "${HYGON_CUDA_ROOT}/bin:$ENV{PATH}")
171+
172+
include_directories("${DTK_ROOT}/include")
173+
include_directories("${HYGON_CUDA_ROOT}/include")
174+
link_directories("${DTK_ROOT}/lib")
175+
link_directories("${HYGON_CUDA_ROOT}/lib64")
176+
177+
message(STATUS "Hygon: CUDA compiler ${CMAKE_CUDA_COMPILER}, arch ${HYGON_ARCH}, DTK root ${DTK_ROOT}")
178+
enable_language(CUDA)
179+
find_package(CUDAToolkit REQUIRED)
180+
endif()
181+
114182
if(WITH_METAX)
115183
add_compile_definitions(WITH_METAX=1)
116184

@@ -179,7 +247,7 @@ if(WITH_CAMBRICON)
179247
endif()
180248

181249
# If all other platforms are not enabled, CPU is enabled by default.
182-
if(NOT WITH_NVIDIA AND NOT WITH_ILUVATAR AND NOT WITH_METAX AND NOT WITH_MOORE AND NOT WITH_CAMBRICON)
250+
if(NOT WITH_NVIDIA AND NOT WITH_ILUVATAR AND NOT WITH_HYGON AND NOT WITH_METAX AND NOT WITH_MOORE AND NOT WITH_CAMBRICON)
183251
add_compile_definitions(WITH_CPU=1)
184252
endif()
185253

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,17 @@ For the `<OPTIONS>`:
3838
|----------------------------------------|------------------------------------|:-:
3939
| `-DWITH_CPU=[ON\|OFF]` | Compile the CPU implementation | n
4040
| `-DWITH_NVIDIA=[ON\|OFF]` | Compile the NVIDIA implementation | n
41+
| `-DWITH_ILUVATAR=[ON\|OFF]` | Compile the Iluvatar implementation| n
42+
| `-DWITH_HYGON=[ON\|OFF]` | Compile the Hygon implementation | n
4143
| `-DWITH_METAX=[ON\|OFF]` | Compile the MetaX implementation | n
4244
| `-DGENERATE_PYTHON_BINDINGS=[ON\|OFF]` | Generate Python bindings | n
4345

4446
*Note: If no accelerator options are provided, `WITH_CPU` is enabled by default.*
4547

48+
For Hygon builds, set `DTK_ROOT` to the DTK installation root if it is not
49+
installed at `/opt/dtk`. You can override the default DCU arch with
50+
`-DHYGON_ARCH=<arch>` when configuring CMake.
51+
4652
## 🚀 Running Examples
4753
After a successful build, the executables are located in the `build/examples` directory.
4854

examples/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@ file(GLOB_RECURSE EXAMPLE_SOURCES CONFIGURE_DEPENDS "*.cc")
22

33
# Iterate through each file and create an executable.
44
foreach(source_file ${EXAMPLE_SOURCES})
5+
if(WITH_HYGON AND source_file MATCHES "/gemm\\.cc$")
6+
continue()
7+
endif()
8+
59
get_filename_component(example_name ${source_file} NAME_WE)
610

711
add_executable(${example_name} ${source_file})

examples/runtime_api.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,15 @@
1919
#define DEVICE_MEMCPY_HOST_TO_DEVICE cudaMemcpyHostToDevice
2020
#define DEVICE_MEMCPY_DEVICE_TO_HOST cudaMemcpyDeviceToHost
2121
#define DEFAULT_DEVICE_TYPE Device::Type::kIluvatar
22+
#elif WITH_HYGON
23+
#include <cuda_runtime.h>
24+
#define DEVICE_MALLOC cudaMalloc
25+
#define DEVICE_FREE cudaFree
26+
#define DEVICE_MEMCPY cudaMemcpy
27+
#define DEVICE_MEMSET cudaMemset
28+
#define DEVICE_MEMCPY_HOST_TO_DEVICE cudaMemcpyHostToDevice
29+
#define DEVICE_MEMCPY_DEVICE_TO_HOST cudaMemcpyDeviceToHost
30+
#define DEFAULT_DEVICE_TYPE Device::Type::kHygon
2231
#elif WITH_METAX
2332
#include <mcr/mc_runtime.h>
2433
#define DEVICE_MALLOC mcMalloc

src/CMakeLists.txt

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,34 @@ if(WITH_ILUVATAR)
7777
list(APPEND DEVICE_LIST "iluvatar")
7878
endif()
7979

80+
if(WITH_HYGON)
81+
set(HYGON_PATTERNS
82+
"cuda/*.cc"
83+
"cuda/*.cpp"
84+
"cuda/*.cu"
85+
"hygon/*.cc"
86+
"hygon/*.cpp"
87+
"hygon/*.cu"
88+
)
89+
90+
file(GLOB_RECURSE HYGON_SOURCES CONFIGURE_DEPENDS ${HYGON_PATTERNS})
91+
92+
enable_language(CUDA)
93+
94+
target_compile_definitions(infiniops PUBLIC WITH_HYGON=1)
95+
target_sources(infiniops PRIVATE ${HYGON_SOURCES})
96+
97+
find_package(CUDAToolkit REQUIRED)
98+
target_link_libraries(infiniops PUBLIC CUDA::cudart CUDA::cublas)
99+
100+
set_target_properties(infiniops PROPERTIES
101+
CUDA_STANDARD 17
102+
CUDA_STANDARD_REQUIRED ON
103+
)
104+
105+
list(APPEND DEVICE_LIST "hygon")
106+
endif()
107+
80108
if(WITH_METAX)
81109
set(METAX_PATTERNS
82110
"cuda/*.cc"
@@ -191,7 +219,7 @@ if(GENERATE_PYTHON_BINDINGS)
191219
set(PYBIND11_SOURCES "${PROJECT_SOURCE_DIR}/generated/bindings/ops.cc")
192220

193221
# TODO: There might be a better solution.
194-
if(WITH_NVIDIA OR WITH_ILUVATAR)
222+
if(WITH_NVIDIA OR WITH_ILUVATAR OR WITH_HYGON)
195223
set_source_files_properties(${PYBIND11_SOURCES} PROPERTIES LANGUAGE CUDA)
196224
endif()
197225

src/hygon/add/kernel.h

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#ifndef INFINI_OPS_HYGON_ADD_KERNEL_H_
2+
#define INFINI_OPS_HYGON_ADD_KERNEL_H_
3+
4+
#include <utility>
5+
6+
#include "cuda/add/kernel.h"
7+
#include "hygon/device_.h"
8+
9+
namespace infini::ops {
10+
11+
namespace add {
12+
13+
struct HygonBackend {
14+
using stream_t = cudaStream_t;
15+
16+
static constexpr Device::Type kDeviceType = Device::Type::kHygon;
17+
18+
static constexpr auto malloc = [](auto&&... args) {
19+
return cudaMalloc(std::forward<decltype(args)>(args)...);
20+
};
21+
22+
static constexpr auto memcpy = cudaMemcpy;
23+
24+
static constexpr auto free = cudaFree;
25+
26+
static constexpr auto memcpyH2D = cudaMemcpyHostToDevice;
27+
28+
static int GetOptimalBlockSize() {
29+
return ComputeOptimalBlockSize(QueryMaxThreadsPerBlock());
30+
}
31+
};
32+
33+
} // namespace add
34+
35+
template <>
36+
class Operator<Add, Device::Type::kHygon> : public CudaAdd<add::HygonBackend> {
37+
public:
38+
using CudaAdd<add::HygonBackend>::CudaAdd;
39+
};
40+
41+
} // namespace infini::ops
42+
43+
#endif

src/hygon/device_.h

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#ifndef INFINI_OPS_HYGON_DEVICE__H_
2+
#define INFINI_OPS_HYGON_DEVICE__H_
3+
4+
#include <cassert>
5+
#include <vector>
6+
7+
// clang-format off
8+
#include <cuda_bf16.h>
9+
#include <cuda_fp16.h>
10+
#include <cuda_runtime.h>
11+
// clang-format on
12+
13+
#include "cuda/caster_.h"
14+
#include "data_type.h"
15+
#include "device.h"
16+
17+
namespace infini::ops {
18+
19+
using cuda_bfloat16 = nv_bfloat16;
20+
21+
using cuda_bfloat162 = nv_bfloat162;
22+
23+
template <>
24+
struct TypeMap<Device::Type::kHygon, DataType::kFloat16> {
25+
using type = half;
26+
};
27+
28+
template <>
29+
struct TypeMap<Device::Type::kHygon, DataType::kBFloat16> {
30+
using type = __nv_bfloat16;
31+
};
32+
33+
// Caches `cudaDeviceProp` per device, initialized once at first access.
34+
class DevicePropertyCache {
35+
public:
36+
static const cudaDeviceProp& GetCurrentDeviceProps() {
37+
int device_id = 0;
38+
cudaGetDevice(&device_id);
39+
return GetDeviceProps(device_id);
40+
}
41+
42+
static const cudaDeviceProp& GetDeviceProps(int device_id) {
43+
static std::vector<cudaDeviceProp> cache = []() {
44+
int count = 0;
45+
cudaGetDeviceCount(&count);
46+
if (count == 0) return std::vector<cudaDeviceProp>{};
47+
std::vector<cudaDeviceProp> props(count);
48+
for (int i = 0; i < count; ++i) {
49+
cudaGetDeviceProperties(&props[i], i);
50+
}
51+
return props;
52+
}();
53+
54+
assert(device_id >= 0 && device_id < static_cast<int>(cache.size()));
55+
return cache[device_id];
56+
}
57+
};
58+
59+
inline int QueryMaxThreadsPerBlock() {
60+
return DevicePropertyCache::GetCurrentDeviceProps().maxThreadsPerBlock;
61+
}
62+
63+
template <>
64+
struct Caster<Device::Type::kHygon> : CudaCasterImpl<Device::Type::kHygon> {};
65+
66+
} // namespace infini::ops
67+
68+
#endif

0 commit comments

Comments
 (0)