-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCMakeLists.txt
More file actions
199 lines (173 loc) · 7.37 KB
/
CMakeLists.txt
File metadata and controls
199 lines (173 loc) · 7.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# Minimum 3.18 (first release with robust CMAKE_CUDA_ARCHITECTURES handling);
# the range form opts in to NEW policy behavior up to 3.29 when available.
cmake_minimum_required(VERSION 3.18...3.29)
project(matrix-ops-lib
VERSION 0.1.0
DESCRIPTION "GPU-accelerated matrix operations library with CUDA tensor core support"
LANGUAGES CXX CUDA
)
# ============================================================================
# C++ / CUDA standards
# ============================================================================
# Require exactly C++17 / CUDA C++17 with no compiler fallback, and disable
# vendor extensions so the host compiler emits -std=c++17, not -std=gnu++17.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_EXTENSIONS OFF)
# ============================================================================
# Default build type
# ============================================================================
# Default to Release, but only for single-config generators: multi-config
# generators (Visual Studio, Xcode, Ninja Multi-Config) ignore
# CMAKE_BUILD_TYPE and choose the configuration at build time, so forcing a
# value there would be misleading. FORCE is safe here because we only write
# the cache entry when the user supplied no value at all.
get_property(_matlib_is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
if(NOT _matlib_is_multi_config AND NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
# Populate the drop-down in cmake-gui / ccmake.
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS Debug Release RelWithDebInfo MinSizeRel)
endif()
# ============================================================================
# CUDA architecture selection
# Defaults to Ampere (SM 8.0) + Hopper (SM 9.0). Override with:
# cmake -DCMAKE_CUDA_ARCHITECTURES="70;80;89;90" ..
# ============================================================================
# Respect any value the user (or a parent project / toolchain file) already
# provided; otherwise fall back to the Ampere+Hopper pair.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES "80;90")
message(STATUS "CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES} (default)")
endif()
# ============================================================================
# Find CUDA libraries
# ============================================================================
# CUDAToolkit provides the CUDA::cudart / CUDA::cublas / CUDA::cusolver /
# CUDA::cusparse imported targets linked below.
find_package(CUDAToolkit REQUIRED)
message(STATUS "CUDA version: ${CUDAToolkit_VERSION}")
# ============================================================================
# Custom CMake modules
# ============================================================================
# Use PROJECT_SOURCE_DIR, not CMAKE_SOURCE_DIR: the latter points at the top
# of the whole build tree and breaks when this project is consumed via
# add_subdirectory / FetchContent from a parent project.
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
include(TensorCoreDetection)
# ============================================================================
# Compiler flags
# ============================================================================
# Enable all CUDA warnings and optimise for the target architecture.
# Per-config flags are expressed as $<CONFIG:...> generator expressions
# rather than an if(CMAKE_BUILD_TYPE ...) test: CMAKE_BUILD_TYPE is empty
# under multi-config generators, which would silently select the wrong
# branch at configure time.
set(CUDA_COMMON_FLAGS
--use_fast_math # Enable FP32 fast math intrinsics
--expt-relaxed-constexpr # Allow constexpr in device code
--expt-extended-lambda # Allow lambdas in device code
-Xcompiler=-Wall # Host compiler warnings
-Xcompiler=-Wextra
-Xcompiler=-Wno-unused-parameter
$<$<CONFIG:Debug>:-g;-G> # Device-side debug info
$<$<NOT:$<CONFIG:Debug>>:-O3;--ptxas-options=-v> # Optimised builds + register/spill report
)
# ============================================================================
# Core library sources
# ============================================================================
set(MATLIB_SOURCES
src/blas/gemm/gemm_naive.cu
src/blas/gemm/gemm_tiled.cu
src/blas/gemm/gemm_registers.cu
src/blas/gemm/gemm_tensor_core.cu
src/blas/gemm/gemm_bf16.cu
src/blas/gemm/gemm_dispatcher.cu
src/blas/gemm/fp_convert.cu
src/blas/gemv.cu
src/blas/trsm.cu
src/blas/batched_gemm.cu
src/sparse/spmv_csr.cu
src/sparse/merge_path_spmv.cu
)
# Static library (used by tests and benchmarks).
add_library(matlib_static STATIC ${MATLIB_SOURCES})
# Namespaced alias so consumers get a hard configure-time error on typos
# instead of a silent link failure.
add_library(matlib::matlib ALIAS matlib_static)
# PROJECT_SOURCE_DIR keeps these paths correct when this project is built
# as a subproject of a larger tree.
target_include_directories(matlib_static PUBLIC
"${PROJECT_SOURCE_DIR}/include"
"${PROJECT_SOURCE_DIR}/kernels"
)
# PUBLIC: the CUDA runtime/BLAS types appear in the public headers' API,
# so consumers need these usage requirements too.
target_link_libraries(matlib_static PUBLIC
CUDA::cudart
CUDA::cublas
CUDA::cusolver
CUDA::cusparse
)
# Apply the CUDA flag set only to CUDA translation units.
target_compile_options(matlib_static PRIVATE
"$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_COMMON_FLAGS}>"
)
# ============================================================================
# Unit tests
# ============================================================================
option(MATLIB_BUILD_TESTS "Build unit and integration tests" ON)
if(MATLIB_BUILD_TESTS)
find_package(GTest QUIET)
if(NOT GTest_FOUND)
# Automatically download GoogleTest via FetchContent.
include(FetchContent)
# On MSVC, make gtest link the shared CRT so it matches the default
# runtime of the rest of the build.
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_Declare(
googletest
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG v1.14.0
)
FetchContent_MakeAvailable(googletest)
endif()
enable_testing()
# Helper to add a test executable with common setup.
# A function (not a macro) keeps its arguments properly scoped; target
# creation and test registration still affect the directory as before.
#   name   - target and CTest test name
#   source - single test source file
function(add_matlib_test name source)
add_executable(${name} ${source})
target_include_directories(${name} PRIVATE "${PROJECT_SOURCE_DIR}/include")
target_link_libraries(${name} PRIVATE
matlib_static
GTest::gtest_main
CUDA::cudart
CUDA::cublas
CUDA::cusolver
)
add_test(NAME ${name} COMMAND ${name})
endfunction()
add_matlib_test(test_gemm tests/unit/test_gemm.cpp)
add_matlib_test(test_trsm tests/unit/test_trsm.cpp)
add_matlib_test(test_tensor_core tests/unit/test_tensor_core.cpp)
# NOTE(review): test_tensor_gemm builds the SAME source as test_tensor_core
# above — likely a copy/paste slip; confirm whether it should point at
# tests/unit/test_tensor_gemm.cpp. Kept as-is to preserve behavior.
add_matlib_test(test_tensor_gemm tests/unit/test_tensor_core.cpp)
add_matlib_test(test_bf16_gemm tests/unit/test_bf16_gemm.cpp)
add_matlib_test(test_accuracy tests/numerical/test_accuracy.cpp)
add_matlib_test(test_conditioning tests/numerical/test_conditioning.cpp)
add_matlib_test(test_cublas_compat tests/integration/test_cublas_compat.cpp)
endif()
# ============================================================================
# Benchmarks
# ============================================================================
option(MATLIB_BUILD_BENCHMARKS "Build benchmark executables" ON)
if(MATLIB_BUILD_BENCHMARKS)
# Helper to add a benchmark executable with common setup.
# A function (not a macro) keeps its arguments properly scoped.
#   name   - benchmark target name
#   source - single benchmark source file
function(add_matlib_bench name source)
add_executable(${name} ${source})
target_include_directories(${name} PRIVATE "${PROJECT_SOURCE_DIR}/include")
target_link_libraries(${name} PRIVATE
matlib_static
CUDA::cudart
CUDA::cublas
)
endfunction()
add_matlib_bench(gemm_bench benchmarks/gemm_bench.cpp)
add_matlib_bench(gemm_sweep benchmarks/gemm_sweep.cpp)
add_matlib_bench(compare_cublas benchmarks/compare_cublas.cpp)
add_matlib_bench(kernel_search benchmarks/autotuning/kernel_search.cpp)
endif()
# ============================================================================
# Python bindings (optional - requires pybind11)
# ============================================================================
# OFF by default since it pulls in a pybind11 dependency. The PythonBindings
# module is expected in the cmake/ directory added to CMAKE_MODULE_PATH
# above — presumably it defines the extension target; confirm its contents.
option(MATLIB_BUILD_PYTHON "Build Python pybind11 extension" OFF)
if(MATLIB_BUILD_PYTHON)
include(PythonBindings)
endif()
# ============================================================================
# Install
# ============================================================================
# GNUInstallDirs supplies platform-correct, user-overridable destinations
# (e.g. lib64 on some Linux distros) instead of hardcoded lib/include.
include(GNUInstallDirs)
install(TARGETS matlib_static
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
install(DIRECTORY include/matlib DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
install(FILES include/matlib.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
# ============================================================================
# Summary
# ============================================================================
# Configure-time report of the effective options. Note: Build type is empty
# under multi-config generators, which select the config at build time.
# HAS_TC / HAS_BF16 are presumably set by the TensorCoreDetection module
# included above — they are not defined anywhere in this file; verify the
# module exports exactly these names.
message(STATUS "")
message(STATUS "matrix-ops-lib configuration summary:")
message(STATUS " Build type : ${CMAKE_BUILD_TYPE}")
message(STATUS " CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
message(STATUS " FP16 tensor cores : ${HAS_TC}")
message(STATUS " BF16 tensor cores : ${HAS_BF16}")
message(STATUS " Build tests : ${MATLIB_BUILD_TESTS}")
message(STATUS " Build benchmarks : ${MATLIB_BUILD_BENCHMARKS}")
message(STATUS " Build Python ext : ${MATLIB_BUILD_PYTHON}")
message(STATUS "")