Skip to content

Commit 6100feb

Browse files
committed
feat: introduce Eigen and OpenMP to accelerate cpu linear op
1 parent b5d6a6b commit 6100feb

6 files changed

Lines changed: 77 additions & 41 deletions

File tree

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,6 @@
44
[submodule "third_party/gflags"]
55
path = third_party/gflags
66
url = git@github.com:gflags/gflags.git
7+
[submodule "third_party/eigen"]
8+
path = third_party/eigen
9+
url = https://gitlab.com/libeigen/eigen.git

CMakeLists.txt

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,20 @@ set(WITH_GTEST OFF CACHE BOOL "Disable glog finding system gtest" FORCE)
2121
add_subdirectory(third_party/glog)
2222
include_directories(${glog_SOURCE_DIR}/src)
2323

24+
# Add eigen
25+
find_package(OpenMP REQUIRED)
26+
# find_package(OpenBLAS REQUIRED)
27+
# include_directories(${OpenBLAS_INCLUDE_DIR})
28+
add_subdirectory(third_party/eigen)
29+
include_directories(${PROJECT_SOURCE_DIR}/third_party/eigen)
30+
# add_definitions(-DEIGEN_USE_BLAS)
31+
2432
if(USE_CUDA)
2533
add_compile_definitions(USE_CUDA=1)
2634
enable_language(CUDA)
2735
include(FindCUDAToolkit)
2836

2937
# enable CUDA-related compilation options
30-
# set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CMAKE_INCLUDE_PATH} -Xcompiler -fPIC --expt-relaxed-constexpr")
3138
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda")
3239

3340
include_directories(${PROJECT_SOURCE_DIR})
@@ -37,25 +44,35 @@ if(USE_CUDA)
3744

3845
add_library(infini_train STATIC ${SRC})
3946
set_target_properties(infini_train PROPERTIES CUDA_ARCHITECTURES "70;80")
40-
target_link_libraries(infini_train glog gflags CUDA::cudart CUDA::cublas)
47+
target_link_libraries(infini_train glog gflags CUDA::cudart CUDA::cublas Eigen3::Eigen)
4148

4249
# Examples
4350
add_executable(mnist example/mnist/main.cc example/mnist/dataset.cc example/mnist/net.cc)
44-
target_link_libraries(mnist glog gflags infini_train)
51+
target_link_libraries(mnist glog gflags infini_train Eigen3::Eigen)
4552

4653
add_executable(gpt2 example/gpt2/main.cc example/gpt2/dataset.cc example/gpt2/net.cc)
47-
target_link_libraries(gpt2 glog gflags infini_train)
54+
target_link_libraries(gpt2 glog gflags infini_train Eigen3::Eigen)
4855
else()
4956
include_directories(${PROJECT_SOURCE_DIR})
5057
file(GLOB_RECURSE SRC ${PROJECT_SOURCE_DIR}/infini_train/src/*.cc)
5158

5259
add_library(infini_train STATIC ${SRC})
53-
target_link_libraries(infini_train glog gflags)
60+
target_link_libraries(infini_train glog gflags Eigen3::Eigen)
5461

5562
# Examples
5663
add_executable(mnist example/mnist/main.cc example/mnist/dataset.cc example/mnist/net.cc)
57-
target_link_libraries(mnist glog gflags infini_train)
64+
target_link_libraries(mnist glog gflags infini_train Eigen3::Eigen)
5865

5966
add_executable(gpt2 example/gpt2/main.cc example/gpt2/dataset.cc example/gpt2/net.cc)
60-
target_link_libraries(gpt2 glog gflags infini_train)
67+
target_link_libraries(gpt2 glog gflags infini_train Eigen3::Eigen)
68+
69+
# OpenBLAS
70+
# target_link_libraries(infini_train ${OpenBLAS_LIBRARIES})
71+
# target_link_libraries(mnist ${OpenBLAS_LIBRARIES})
72+
# target_link_libraries(gpt2 ${OpenBLAS_LIBRARIES})
73+
74+
# OpenMP
75+
target_link_libraries(infini_train OpenMP::OpenMP_CXX)
76+
target_link_libraries(mnist OpenMP::OpenMP_CXX)
77+
target_link_libraries(gpt2 OpenMP::OpenMP_CXX)
6178
endif()

infini_train/include/tensor.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <random>
88
#include <vector>
99

10+
#include "Eigen/Dense"
1011
#include "glog/logging.h"
1112

1213
#include "infini_train/include/device.h"
@@ -69,6 +70,9 @@ class Tensor : public std::enable_shared_from_this<Tensor> {
6970

7071
template <typename T> void Fill(T value);
7172

73+
Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> EigenMatrix();
74+
Eigen::Map<Eigen::Matrix<float, 1, Eigen::Dynamic, Eigen::RowMajor>> EigenVector();
75+
7276
// TODO(dcj): return shared_ptr<Tensor> instead of Tensor later
7377
Tensor To(Device device);
7478

infini_train/src/kernels/cpu/linear.cc

Lines changed: 31 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -106,11 +106,11 @@ MatmulBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tenso
106106
std::shared_ptr<Tensor> LinearForward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &weight,
107107
bool transpose, const std::shared_ptr<Tensor> &bias) {
108108
/*
109-
!transpose: output = input * weight + bias
110-
output[*, out_features] = input[*, in_features] * weight[in_features, out_features] + bias[out_features]
111-
112109
transpose: output = input * weight^T + bias
113110
output[*, out_features] = input[*, in_features] * weight[out_features, in_features]^T + bias[out_features]
111+
112+
!transpose: output = input * weight + bias
113+
output[*, out_features] = input[*, in_features] * weight[in_features, out_features] + bias[out_features]
114114
*/
115115

116116
const auto &input_dims = input->Dims();
@@ -130,24 +130,32 @@ std::shared_ptr<Tensor> LinearForward(const std::shared_ptr<Tensor> &input, cons
130130
auto output_dims = input_dims;
131131
*output_dims.rbegin() = out_features;
132132
auto output = std::make_shared<Tensor>(output_dims, DataType::kFLOAT32);
133-
for (int64_t i = 0; i < bs; ++i) {
134-
for (int64_t j = 0; j < out_features; ++j) {
135-
auto *data_ptr = static_cast<float *>(output->DataPtr()) + i * out_features + j;
136-
*data_ptr = 0.0f;
137-
for (int64_t k = 0; k < in_features; ++k) {
138-
*data_ptr += reinterpret_cast<const float *>(input->DataPtr())[i * in_features + k]
139-
* reinterpret_cast<const float *>(
140-
weight->DataPtr())[transpose ? j * in_features + k : k * out_features + j];
141-
}
142-
*data_ptr += reinterpret_cast<const float *>(bias->DataPtr())[j];
143-
}
133+
134+
if (transpose) {
135+
output->EigenMatrix() = input->EigenMatrix() * weight->EigenMatrix().transpose();
136+
} else {
137+
output->EigenMatrix() = input->EigenMatrix() * weight->EigenMatrix();
144138
}
139+
output->EigenMatrix().rowwise() += bias->EigenVector();
140+
145141
return output;
146142
}
147143

148144
std::tuple<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>, std::shared_ptr<Tensor>>
149145
LinearBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &weight, bool transpose,
150146
int64_t out_features, const std::shared_ptr<Tensor> &grad_output) {
147+
/*
148+
transpose: grad_input = grad_output * weight
149+
grad_input[*, in_features] = grad_output[*, out_features] * weight[out_features, in_features]
150+
grad_weight[out_features, in_features] = grad_output[*, out_features]^T * input[*, in_features]
151+
grad_bias[out_features] = grad_output[*, out_features].sum(axis=0)
152+
153+
!transpose: grad_input = grad_output * weight^T
154+
grad_input[*, in_features] = grad_output[*, out_features] * weight[in_features, out_features]^T
155+
grad_weight[in_features, out_features] = input[*, in_features]^T * grad_output[*, out_features]
156+
grad_bias[out_features] = grad_output[*, out_features].sum(axis=0)
157+
*/
158+
151159
const auto &input_dims = input->Dims();
152160
CHECK_GE(input_dims.size(), 2);
153161
const int64_t bs = std::accumulate(input_dims.rbegin() + 1, input_dims.rend(), 1, std::multiplies<int64_t>{});
@@ -160,28 +168,17 @@ LinearBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tenso
160168

161169
auto grad_input = std::make_shared<Tensor>(input_dims, DataType::kFLOAT32);
162170
auto grad_weight = std::make_shared<Tensor>(weight_dims, DataType::kFLOAT32);
163-
grad_weight->Fill<float>(0.0f);
164171
auto grad_bias = std::make_shared<Tensor>(std::vector<int64_t>{out_features}, DataType::kFLOAT32);
165-
grad_bias->Fill<float>(0.0f);
166-
167-
for (int64_t i = 0; i < bs; ++i) {
168-
for (int64_t j = 0; j < in_features; ++j) {
169-
const auto input_idx = i * in_features + j;
170-
auto *data_ptr = static_cast<float *>(grad_input->DataPtr()) + input_idx;
171-
*data_ptr = 0.0f;
172-
for (int64_t k = 0; k < out_features; ++k) {
173-
const auto weight_idx = transpose ? k * in_features + j : j * out_features + k;
174-
const auto grad = reinterpret_cast<const float *>(grad_output->DataPtr())[i * out_features + k];
175-
*data_ptr += grad * reinterpret_cast<const float *>(weight->DataPtr())[weight_idx];
176-
static_cast<float *>(grad_weight->DataPtr())[weight_idx]
177-
+= grad * reinterpret_cast<const float *>(input->DataPtr())[input_idx];
178-
}
179-
}
180-
for (int64_t k = 0; k < out_features; ++k) {
181-
static_cast<float *>(grad_bias->DataPtr())[k]
182-
+= reinterpret_cast<const float *>(grad_output->DataPtr())[i * out_features + k];
183-
}
172+
173+
if (transpose) {
174+
grad_input->EigenMatrix() = grad_output->EigenMatrix() * weight->EigenMatrix();
175+
grad_weight->EigenMatrix() = grad_output->EigenMatrix().transpose() * input->EigenMatrix();
176+
} else {
177+
grad_input->EigenMatrix() = grad_output->EigenMatrix() * weight->EigenMatrix().transpose();
178+
grad_weight->EigenMatrix() = input->EigenMatrix().transpose() * grad_output->EigenMatrix();
184179
}
180+
grad_bias->EigenVector() = grad_output->EigenMatrix().colwise().sum();
181+
185182
return {grad_input, grad_weight, grad_bias};
186183
}
187184
} // namespace infini_train::kernels::cpu

infini_train/src/tensor.cc

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
#ifdef USE_CUDA
1111
#include "cuda_runtime_api.h"
1212
#endif
13+
14+
#include "Eigen/Dense"
1315
#include "glog/logging.h"
1416

1517
#include "infini_train/include/autograd/elementwise.h"
@@ -124,6 +126,18 @@ template <typename T> void Tensor::Fill(T value) {
124126

125127
template void Tensor::Fill<float>(float);
126128

129+
Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> Tensor::EigenMatrix() {
130+
const int64_t bs = std::accumulate(dims_.rbegin() + 1, dims_.rend(), 1, std::multiplies<int64_t>());
131+
return Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>(
132+
reinterpret_cast<float *>(DataPtr()), bs, *dims_.rbegin());
133+
}
134+
135+
Eigen::Map<Eigen::Matrix<float, 1, Eigen::Dynamic, Eigen::RowMajor>> Tensor::EigenVector() {
136+
CHECK_EQ(dims_.size(), 1);
137+
return Eigen::Map<Eigen::Matrix<float, 1, Eigen::Dynamic, Eigen::RowMajor>>(reinterpret_cast<float *>(DataPtr()), 1,
138+
dims_[0]);
139+
}
140+
127141
Tensor Tensor::To(Device device) {
128142
if (device == buffer_->GetDevice()) {
129143
auto new_tensor = Tensor(*this, offset_, dims_);

third_party/eigen

Submodule eigen added at 68f4e58

0 commit comments

Comments
 (0)