
Commit 36043fe

Math Library Blog Samples (#43)
1 parent 2a1d364 commit 36043fe

4 files changed
Lines changed: 324 additions & 0 deletions

File tree

posts/math-libraries-intro
├── Makefile
├── README.md
├── cublas-example.cu
└── openblas-example.c

posts/math-libraries-intro/Makefile
Lines changed: 42 additions & 0 deletions
# Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

BINARIES= cublas-example openblas-example

all: $(BINARIES)

openblas-example: openblas-example.c Makefile
	g++ -std=c++11 -pthread -O3 -Wall openblas-example.c -o openblas-example -lopenblas -lpthread -lm

cublas-example: cublas-example.cu Makefile
	nvcc -lcublas -std=c++11 cublas-example.cu -o cublas-example

clean:
	rm -f *.o $(BINARIES)

run: $(BINARIES)
	./openblas-example
	./cublas-example

posts/math-libraries-intro/README.md
Lines changed: 128 additions & 0 deletions
# GEMM Examples

Matrix Multiplication performed using OpenBLAS and cuBLAS.

## Getting Started

### Packages Used

- CUDA Toolkit 11.3
- OpenBLAS 0.2.19

### Hardware Specifications

**CPU:**
Intel(R) Xeon(R) CPU E5-2698 v3 @ 2.30GHz

**GPU:**
Tesla V100-PCIE 32GB

#### Set Environment Variables

`export OPENBLAS_NUM_THREADS=32`

#### Set GPU Clocks

- Copy the code block below into your `.bashrc` file

```bash
scs () {

    module load cuda/11.2.1

    DATE=$(date +"%m%d%y-%H%M%S")
    G_NAME=$(nvidia-smi -i 0 --query-gpu=gpu_name --format=csv,nounits,noheader | sed 's/ /-/g')
    G_CLK=$(nvidia-smi -i 0 --query-gpu=clocks.max.sm --format=csv,nounits,noheader)
    M_CLK=$(nvidia-smi -i 0 --query-gpu=clocks.max.memory --format=csv,nounits,noheader)
    P_LIMIT=$(nvidia-smi -i 0 --query-gpu=power.max_limit --format=csv,nounits,noheader)
    DRIVER=$(nvidia-smi -i 0 --query-gpu=driver_version --format=csv,nounits,noheader)

    sudo nvidia-smi
    sudo nvidia-smi -pm ENABLED
    sudo nvidia-smi --auto-boost-default=0
    sudo nvidia-smi -ac ${M_CLK},${G_CLK}
    sudo nvidia-smi -lgc ${G_CLK},${G_CLK}
    sudo nvidia-smi -pl ${P_LIMIT}
    sudo nvidia-smi -q -d POWER,CLOCK
}
```

- Update with `source ~/.bashrc`
- Run `scs` to apply the settings

## Running Examples

### Build

Clone the repository, then:

`cd code-samples/posts/math-libraries-intro`

`make`
### OpenBLAS

#### Run

`./openblas-example`

#### Sample Output

```text
This example computes real matrix C=alpha*A*B+beta*C using
OpenBLAS dgemm, where A, B, and C are matrices and
alpha and beta are double precision scalars

Initializing data for matrix multiplication C=A*B for matrix
A(4092x4092) and matrix B(4092x4092)

Allocating memory for matrices aligned on 64-byte boundary for better
performance

Time Elapsed: 414.35 ms

Example completed.
```

### cuBLAS

#### Run

`./cublas-example`

#### Sample Output

```text
This example computes real matrix C=alpha*A*B+beta*C using
cuBLAS dgemm, where A, B, and C are matrices and
alpha and beta are double precision scalars

Initializing data for matrix multiplication C=A*B for matrix
A(4092x4092) and matrix B(4092x4092)

Allocating memory for matrices aligned on 64-byte boundary for better
performance

Computing matrix product using cuBLAS dgemm function

Computations completed.

Time Elapsed: 19.80 ms

Deallocating memory

Example completed.
```

posts/math-libraries-intro/cublas-example.cu
Lines changed: 107 additions & 0 deletions
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <stdexcept>
#include <vector>

#include <cublas_v2.h>
#include <cuda_runtime.h>

// CUDA API error checking
#define CUDA_CHECK(err)                                                        \
  do {                                                                         \
    cudaError_t err_ = (err);                                                  \
    if (err_ != cudaSuccess) {                                                 \
      std::printf("CUDA error %d at %s:%d\n", err_, __FILE__, __LINE__);       \
      throw std::runtime_error("CUDA error");                                  \
    }                                                                          \
  } while (0)

// cublas API error checking
#define CUBLAS_CHECK(err)                                                      \
  do {                                                                         \
    cublasStatus_t err_ = (err);                                               \
    if (err_ != CUBLAS_STATUS_SUCCESS) {                                       \
      std::printf("cublas error %d at %s:%d\n", err_, __FILE__, __LINE__);     \
      throw std::runtime_error("cublas error");                                \
    }                                                                          \
  } while (0)

int main(int argc, char *argv[]) {
  cublasHandle_t cublasH = NULL;
  cudaStream_t stream = NULL;

  cudaEvent_t startEvent { nullptr };
  cudaEvent_t stopEvent { nullptr };
  float elapsed_gpu_ms {};

  int m, n, k;
  int lda, ldb, ldc;
  double alpha, beta;

  printf("\n This example computes real matrix C=alpha*A*B+beta*C using \n"
         " cuBLAS dgemm, where A, B, and C are matrices and \n"
         " alpha and beta are double precision scalars\n\n");

  int size = 4092;
  m = size, k = size, n = size;
  lda = size, ldb = size, ldc = size;
  printf(" Initializing data for matrix multiplication C=A*B for matrix \n"
         " A(%ix%i) and matrix B(%ix%i)\n\n", m, k, k, n);
  alpha = 1.0, beta = 0.0;

  printf(" Allocating memory for matrices aligned on 64-byte boundary for better \n"
         " performance \n\n");

  double *d_A = nullptr;
  double *d_B = nullptr;
  double *d_C = nullptr;

  /* step 1: create cublas handle, bind a stream */
  CUBLAS_CHECK(cublasCreate(&cublasH));

  CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  CUBLAS_CHECK(cublasSetStream(cublasH, stream));

  /* step 2: allocate device memory */
  CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_A), sizeof(double) * m * k));
  CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_B), sizeof(double) * k * n));
  CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_C), sizeof(double) * m * n));

  /* step 3: compute */
  printf(" Computing matrix product using cuBLAS dgemm function \n\n");

  // Blocking-sync events keep the waiting CPU thread from spinning.
  CUDA_CHECK(cudaEventCreateWithFlags(&startEvent, cudaEventBlockingSync));
  CUDA_CHECK(cudaEventCreateWithFlags(&stopEvent, cudaEventBlockingSync));

  // Record the events on the same stream the GEMMs run on, so they bracket the work.
  CUDA_CHECK(cudaEventRecord(startEvent, stream));

  // Run the GEMM 10 times and report the average time per call.
  for (int i = 0; i < 10; i++)
    CUBLAS_CHECK(
        cublasDgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc));

  CUDA_CHECK(cudaStreamSynchronize(stream));

  CUDA_CHECK(cudaEventRecord(stopEvent, stream));
  CUDA_CHECK(cudaEventSynchronize(stopEvent));

  printf("\n Computations completed.\n\n");

  CUDA_CHECK(cudaEventElapsedTime(&elapsed_gpu_ms, startEvent, stopEvent));
  printf(" Time Elapsed: %0.2f ms \n\n", elapsed_gpu_ms / 10);

  /* free resources */
  printf("\n Deallocating memory \n\n");
  CUDA_CHECK(cudaFree(d_A));
  CUDA_CHECK(cudaFree(d_B));
  CUDA_CHECK(cudaFree(d_C));

  CUBLAS_CHECK(cublasDestroy(cublasH));

  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaDeviceReset());

  printf(" Example completed. \n\n");
  return 0;
}
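
Note that the sample above only allocates `d_A`, `d_B`, and `d_C`; it never copies host data to the device, so the GEMM runs on whatever the device memory happens to contain. Below is a minimal sketch of the host-side initialization and transfers that could sit between steps 2 and 3; `h_A`, `h_B`, `h_C`, and the constant fill values are hypothetical and not part of the original sample, while `d_A`, `d_B`, `d_C`, `stream`, and `CUDA_CHECK` refer to the names already defined above.

```cpp
// Hypothetical addition, not part of the original sample: fill host buffers
// and copy them to the device so the GEMM operates on defined data.
std::vector<double> h_A(static_cast<size_t>(m) * k, 1.0);  // arbitrary fill values
std::vector<double> h_B(static_cast<size_t>(k) * n, 2.0);
std::vector<double> h_C(static_cast<size_t>(m) * n, 0.0);

// Copy the inputs on the stream the GEMM will run on (after the cudaMalloc calls).
CUDA_CHECK(cudaMemcpyAsync(d_A, h_A.data(), sizeof(double) * h_A.size(),
                           cudaMemcpyHostToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(d_B, h_B.data(), sizeof(double) * h_B.size(),
                           cudaMemcpyHostToDevice, stream));

// ... cublasDgemm loop as in the example ...

// Retrieve the result and wait for the stream to drain.
CUDA_CHECK(cudaMemcpyAsync(h_C.data(), d_C, sizeof(double) * h_C.size(),
                           cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
```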

posts/math-libraries-intro/openblas-example.c
Lines changed: 47 additions & 0 deletions
#include <cblas.h>
#include <stdio.h>
#include <vector>
#include <chrono>

int main(int argc, char *argv[]) {
  int m, n, k;
  int lda, ldb, ldc;
  double alpha, beta;
  int l, loops;

  std::chrono::high_resolution_clock::time_point start {};
  std::chrono::high_resolution_clock::time_point stop {};
  std::chrono::duration<double, std::milli> elapsed_cpu_ms {};

  printf("\n This example computes real matrix C=alpha*A*B+beta*C using \n"
         " OpenBLAS dgemm, where A, B, and C are matrices and \n"
         " alpha and beta are double precision scalars\n\n");

  int size = 4092;
  m = size, k = size, n = size;
  lda = size, ldb = size, ldc = size;
  alpha = 1.0, beta = 0.0;
  loops = 10;

  printf(" Initializing data for matrix multiplication C=A*B for matrix \n"
         " A(%ix%i) and matrix B(%ix%i)\n\n",
         m, k, k, n);

  // Note: the timer starts before the matrices are allocated, so the reported
  // average includes a share of the allocation cost.
  start = std::chrono::high_resolution_clock::now();
  printf(" Allocating memory for matrices aligned on 64-byte boundary for better \n"
         " performance \n\n");
  std::vector<double> A(m * k);
  std::vector<double> B(k * n);
  std::vector<double> C(m * n);

  // Run the GEMM `loops` times and report the average time per call.
  // B is passed transposed here (CblasTrans), unlike the cuBLAS example.
  for (l = 0; l < loops; l++)
    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, m, n, k, alpha, A.data(), lda, B.data(), ldb, beta, C.data(), ldc);

  stop = std::chrono::high_resolution_clock::now();
  elapsed_cpu_ms = stop - start;

  printf(" Time Elapsed: %0.2f ms \n\n", elapsed_cpu_ms.count() / loops);
  printf(" Example completed. \n\n");

  return 0;
}
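
The README controls CPU parallelism through the `OPENBLAS_NUM_THREADS` environment variable; OpenBLAS also exposes this at run time. Here is a minimal, self-contained sketch, assuming an OpenBLAS build whose `cblas.h` declares the `openblas_set_num_threads` / `openblas_get_num_threads` extensions (recent releases do):

```cpp
#include <cblas.h>
#include <cstdio>

int main() {
    // Pin the OpenBLAS thread count from code instead of the environment;
    // 32 matches the value used in the README.
    openblas_set_num_threads(32);
    std::printf("OpenBLAS will use %d threads\n", openblas_get_num_threads());
    return 0;
}
```

This can be compiled the same way as openblas-example (g++ with `-lopenblas`) to check the thread count the library actually picks up.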
