diff --git a/.gitmodules b/.gitmodules
index 6d9c3b96..6d138b58 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,6 @@
 [submodule "third_party/gflags"]
 	path = third_party/gflags
 	url = git@github.com:gflags/gflags.git
+[submodule "third_party/eigen"]
+	path = third_party/eigen
+	url = https://gitlab.com/libeigen/eigen.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7541c30c..5df46ee1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -21,13 +21,20 @@ set(WITH_GTEST OFF CACHE BOOL "Disable glog finding system gtest" FORCE)
 add_subdirectory(third_party/glog)
 include_directories(${glog_SOURCE_DIR}/src)
 
+# Add eigen
+find_package(OpenMP REQUIRED)
+# find_package(OpenBLAS REQUIRED)
+# include_directories(${OpenBLAS_INCLUDE_DIR})
+add_subdirectory(third_party/eigen)
+include_directories(${PROJECT_SOURCE_DIR}/third_party/eigen)
+# add_definitions(-DEIGEN_USE_BLAS)
+
 if(USE_CUDA)
     add_compile_definitions(USE_CUDA=1)
     enable_language(CUDA)
     include(FindCUDAToolkit)
 
     # enable CUDA-related compilation options
-    # set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CMAKE_INCLUDE_PATH} -Xcompiler -fPIC --expt-relaxed-constexpr")
     set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda")
 
     include_directories(${PROJECT_SOURCE_DIR})
@@ -37,25 +44,35 @@ if(USE_CUDA)
     add_library(infini_train STATIC ${SRC})
     set_target_properties(infini_train PROPERTIES CUDA_ARCHITECTURES "70;80")
-    target_link_libraries(infini_train glog gflags CUDA::cudart CUDA::cublas)
+    target_link_libraries(infini_train glog gflags CUDA::cudart CUDA::cublas Eigen3::Eigen)
 
     # Examples
     add_executable(mnist example/mnist/main.cc example/mnist/dataset.cc example/mnist/net.cc)
-    target_link_libraries(mnist glog gflags infini_train)
+    target_link_libraries(mnist glog gflags infini_train Eigen3::Eigen)
 
     add_executable(gpt2 example/gpt2/main.cc example/gpt2/dataset.cc example/gpt2/net.cc)
-    target_link_libraries(gpt2 glog gflags infini_train)
+    target_link_libraries(gpt2 glog gflags infini_train Eigen3::Eigen)
 else()
     include_directories(${PROJECT_SOURCE_DIR})
 
     file(GLOB_RECURSE SRC ${PROJECT_SOURCE_DIR}/infini_train/src/*.cc)
 
     add_library(infini_train STATIC ${SRC})
-    target_link_libraries(infini_train glog gflags)
+    target_link_libraries(infini_train glog gflags Eigen3::Eigen)
 
     # Examples
     add_executable(mnist example/mnist/main.cc example/mnist/dataset.cc example/mnist/net.cc)
-    target_link_libraries(mnist glog gflags infini_train)
+    target_link_libraries(mnist glog gflags infini_train Eigen3::Eigen)
 
     add_executable(gpt2 example/gpt2/main.cc example/gpt2/dataset.cc example/gpt2/net.cc)
-    target_link_libraries(gpt2 glog gflags infini_train)
+    target_link_libraries(gpt2 glog gflags infini_train Eigen3::Eigen)
+
+    # OpenBLAS
+    # target_link_libraries(infini_train ${OpenBLAS_LIBRARIES})
+    # target_link_libraries(mnist ${OpenBLAS_LIBRARIES})
+    # target_link_libraries(gpt2 ${OpenBLAS_LIBRARIES})
+
+    # OpenMP
+    target_link_libraries(infini_train OpenMP::OpenMP_CXX)
+    target_link_libraries(mnist OpenMP::OpenMP_CXX)
+    target_link_libraries(gpt2 OpenMP::OpenMP_CXX)
 endif()
diff --git a/infini_train/include/tensor.h b/infini_train/include/tensor.h
index 968dca5a..e88d5eeb 100644
--- a/infini_train/include/tensor.h
+++ b/infini_train/include/tensor.h
@@ -7,6 +7,7 @@
 #include <memory>
 #include <vector>
 
+#include "Eigen/Dense"
 #include "glog/logging.h"
 
 #include "infini_train/include/device.h"
@@ -69,6 +70,9 @@ class Tensor : public std::enable_shared_from_this<Tensor> {
     template <typename T> void Fill(T value);
 
+    Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> EigenMatrix();
+    Eigen::Map<Eigen::Matrix<float, 1, Eigen::Dynamic>> EigenVector();
+
     // TODO(dcj): return shared_ptr<Tensor> instead of Tensor later
     Tensor To(Device device);
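For context on the new EigenMatrix() / EigenVector() accessors declared above: an Eigen::Map wraps an existing buffer without copying, so assigning through the map writes straight into the tensor's storage. A minimal standalone sketch (not part of this patch) of the same idea:

    // Sketch: alias a row-major float buffer as an Eigen matrix, no copy.
    #include <vector>
    #include "Eigen/Dense"

    using RowMajorMatrixXf = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;

    int main() {
        std::vector<float> buffer = {1, 2, 3, 4, 5, 6}; // a 2x3 "tensor", row-major contiguous
        Eigen::Map<RowMajorMatrixXf> mat(buffer.data(), 2, 3);
        mat *= 2.0f;                    // scales the underlying buffer in place
        return buffer[5] == 12.0f ? 0 : 1;
    }

The RowMajor flag matters: Eigen defaults to column-major storage, so mapping the tensor's contiguous row-major buffer without it would reinterpret the data in the wrong order.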
diff --git a/infini_train/src/kernels/cpu/linear.cc b/infini_train/src/kernels/cpu/linear.cc
index bc740f74..52374faa 100644
--- a/infini_train/src/kernels/cpu/linear.cc
+++ b/infini_train/src/kernels/cpu/linear.cc
@@ -106,11 +106,11 @@ MatmulBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tens
 std::shared_ptr<Tensor> LinearForward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &weight,
                                       bool transpose, const std::shared_ptr<Tensor> &bias) {
     /*
-       !transpose: output = input * weight + bias
-       output[*, out_features] = input[*, in_features] * weight[in_features, out_features] + bias[out_features]
-
        transpose: output = input * weight^T + bias
        output[*, out_features] = input[*, in_features] * weight[out_features, in_features]^T + bias[out_features]
+
+       !transpose: output = input * weight + bias
+       output[*, out_features] = input[*, in_features] * weight[in_features, out_features] + bias[out_features]
     */
 
     const auto &input_dims = input->Dims();
@@ -130,24 +130,32 @@
     auto output_dims = input_dims;
     *output_dims.rbegin() = out_features;
     auto output = std::make_shared<Tensor>(output_dims, DataType::kFLOAT32);
-    for (int64_t i = 0; i < bs; ++i) {
-        for (int64_t j = 0; j < out_features; ++j) {
-            auto *data_ptr = static_cast<float *>(output->DataPtr()) + i * out_features + j;
-            *data_ptr = 0.0f;
-            for (int64_t k = 0; k < in_features; ++k) {
-                *data_ptr += reinterpret_cast<const float *>(input->DataPtr())[i * in_features + k]
-                           * reinterpret_cast<const float *>(
-                                 weight->DataPtr())[transpose ? j * in_features + k : k * out_features + j];
-            }
-            *data_ptr += reinterpret_cast<const float *>(bias->DataPtr())[j];
-        }
+
+    if (transpose) {
+        output->EigenMatrix() = input->EigenMatrix() * weight->EigenMatrix().transpose();
+    } else {
+        output->EigenMatrix() = input->EigenMatrix() * weight->EigenMatrix();
     }
+    output->EigenMatrix().rowwise() += bias->EigenVector();
+
     return output;
 }
 
 std::tuple<std::shared_ptr<Tensor>, std::shared_ptr<Tensor>, std::shared_ptr<Tensor>>
 LinearBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &weight, bool transpose,
                int64_t out_features, const std::shared_ptr<Tensor> &grad_output) {
+    /*
+       transpose: grad_input = grad_output * weight
+       grad_input[*, in_features] = grad_output[*, out_features] * weight[out_features, in_features]
+       grad_weight[out_features, in_features] = grad_output[*, out_features]^T * input[*, in_features]
+       grad_bias[out_features] = grad_output[*, out_features].sum(axis=0)
+
+       !transpose: grad_input = grad_output * weight^T
+       grad_input[*, in_features] = grad_output[*, out_features] * weight[in_features, out_features]^T
+       grad_weight[in_features, out_features] = input[*, in_features]^T * grad_output[*, out_features]
+       grad_bias[out_features] = grad_output[*, out_features].sum(axis=0)
+    */
+
     const auto &input_dims = input->Dims();
     CHECK_GE(input_dims.size(), 2);
     const int64_t bs = std::accumulate(input_dims.rbegin() + 1, input_dims.rend(), 1, std::multiplies<int64_t>{});
@@ -160,28 +168,17 @@ LinearBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tens
     auto grad_input = std::make_shared<Tensor>(input_dims, DataType::kFLOAT32);
     auto grad_weight = std::make_shared<Tensor>(weight_dims, DataType::kFLOAT32);
-    grad_weight->Fill<float>(0.0f);
     auto grad_bias = std::make_shared<Tensor>(std::vector<int64_t>{out_features}, DataType::kFLOAT32);
-    grad_bias->Fill<float>(0.0f);
-
-    for (int64_t i = 0; i < bs; ++i) {
-        for (int64_t j = 0; j < in_features; ++j) {
-            const auto input_idx = i * in_features + j;
-            auto *data_ptr = static_cast<float *>(grad_input->DataPtr()) + input_idx;
-            *data_ptr = 0.0f;
-            for (int64_t k = 0; k < out_features; ++k) {
-                const auto weight_idx = transpose ? k * in_features + j : j * out_features + k;
-                const auto grad = reinterpret_cast<const float *>(grad_output->DataPtr())[i * out_features + k];
-                *data_ptr += grad * reinterpret_cast<const float *>(weight->DataPtr())[weight_idx];
-                static_cast<float *>(grad_weight->DataPtr())[weight_idx]
-                    += grad * reinterpret_cast<const float *>(input->DataPtr())[input_idx];
-            }
-        }
-        for (int64_t k = 0; k < out_features; ++k) {
-            static_cast<float *>(grad_bias->DataPtr())[k]
-                += reinterpret_cast<const float *>(grad_output->DataPtr())[i * out_features + k];
-        }
+
+    if (transpose) {
+        grad_input->EigenMatrix() = grad_output->EigenMatrix() * weight->EigenMatrix();
+        grad_weight->EigenMatrix() = grad_output->EigenMatrix().transpose() * input->EigenMatrix();
+    } else {
+        grad_input->EigenMatrix() = grad_output->EigenMatrix() * weight->EigenMatrix().transpose();
+        grad_weight->EigenMatrix() = input->EigenMatrix().transpose() * grad_output->EigenMatrix();
     }
+    grad_bias->EigenVector() = grad_output->EigenMatrix().colwise().sum();
+
     return {grad_input, grad_weight, grad_bias};
 }
 
 } // namespace infini_train::kernels::cpu
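The rewritten kernels above replace the hand-written triple loops with Eigen GEMM calls. A small self-contained check (hypothetical shapes, plain Eigen, no Tensor class) that the transpose-case formulas from the comment block line up dimensionally:

    // Sketch: transpose case, weight stored as [out_features, in_features].
    #include "Eigen/Dense"

    using RowMajorMatrixXf = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;

    int main() {
        const int bs = 4, in_features = 3, out_features = 2;
        RowMajorMatrixXf input = RowMajorMatrixXf::Random(bs, in_features);
        RowMajorMatrixXf weight = RowMajorMatrixXf::Random(out_features, in_features);
        Eigen::RowVectorXf bias = Eigen::RowVectorXf::Random(out_features);

        // forward: output = input * weight^T + bias
        RowMajorMatrixXf output = input * weight.transpose();
        output.rowwise() += bias;

        // backward, with grad_output of the same shape as output
        RowMajorMatrixXf grad_output = RowMajorMatrixXf::Ones(bs, out_features);
        RowMajorMatrixXf grad_input = grad_output * weight;             // [bs, in_features]
        RowMajorMatrixXf grad_weight = grad_output.transpose() * input; // [out_features, in_features]
        Eigen::RowVectorXf grad_bias = grad_output.colwise().sum();     // [out_features]
        return (grad_input.cols() == in_features && grad_weight.rows() == out_features) ? 0 : 1;
    }

Note that the Eigen version also drops the explicit Fill(0.0f) calls: each gradient is now produced by a single assignment rather than accumulated element by element, so zero-initialization is no longer needed.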
diff --git a/infini_train/src/tensor.cc b/infini_train/src/tensor.cc
index 71fef273..509991e2 100644
--- a/infini_train/src/tensor.cc
+++ b/infini_train/src/tensor.cc
@@ -10,6 +10,8 @@
 #ifdef USE_CUDA
 #include "cuda_runtime_api.h"
 #endif
+
+#include "Eigen/Dense"
 #include "glog/logging.h"
 
 #include "infini_train/include/autograd/elementwise.h"
@@ -124,6 +126,18 @@ template <typename T> void Tensor::Fill(T value) {
 template void Tensor::Fill<float>(float);
 
+Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> Tensor::EigenMatrix() {
+    const int64_t bs = std::accumulate(dims_.rbegin() + 1, dims_.rend(), 1, std::multiplies<int64_t>());
+    return Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>(
+        reinterpret_cast<float *>(DataPtr()), bs, *dims_.rbegin());
+}
+
+Eigen::Map<Eigen::Matrix<float, 1, Eigen::Dynamic>> Tensor::EigenVector() {
+    CHECK_EQ(dims_.size(), 1);
+    return Eigen::Map<Eigen::Matrix<float, 1, Eigen::Dynamic>>(reinterpret_cast<float *>(DataPtr()), 1,
+                                                               dims_[0]);
+}
+
 Tensor Tensor::To(Device device) {
     if (device == buffer_->GetDevice()) {
         auto new_tensor = Tensor(*this, offset_, dims_);
diff --git a/third_party/eigen b/third_party/eigen
new file mode 160000
index 00000000..68f4e58c
--- /dev/null
+++ b/third_party/eigen
@@ -0,0 +1 @@
+Subproject commit 68f4e58cfacc686583d16cff90361f0b43bc2c1b
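One build-level remark on the CMake change: Eigen dispatches its large matrix products to an OpenMP thread pool when compiled with OpenMP support, which is what linking OpenMP::OpenMP_CXX enables here. A minimal sketch (not from the patch) of observing and controlling that behaviour:

    // Sketch: Eigen's GEMM runs multi-threaded when built with OpenMP.
    #include <cstdio>
    #include "Eigen/Dense"

    int main() {
        Eigen::setNbThreads(4); // optional: cap Eigen's OpenMP thread count
        Eigen::MatrixXf a = Eigen::MatrixXf::Random(512, 512);
        Eigen::MatrixXf b = Eigen::MatrixXf::Random(512, 512);
        Eigen::MatrixXf c = a * b; // parallelized GEMM
        std::printf("eigen threads: %d, checksum: %f\n", Eigen::nbThreads(), c.sum());
        return 0;
    }

The commented-out OpenBLAS lines and -DEIGEN_USE_BLAS keep an alternative backend on record: defining EIGEN_USE_BLAS makes Eigen forward its dense products to an external BLAS instead of its own kernels.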