
Commit 36043fe

Math Library Blog Samples (#43)
1 parent 2a1d364 commit 36043fe

4 files changed
Lines changed: 324 additions & 0 deletions

File tree

posts/math-libraries-intro
├── Makefile
├── README.md
├── cublas-example.cu
└── openblas-example.c

posts/math-libraries-intro/Makefile
Lines changed: 42 additions & 0 deletions
# Copyright (c) 1993-2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

BINARIES= cublas-example openblas-example

all: $(BINARIES)

openblas-example: openblas-example.c Makefile
	g++ -std=c++11 -pthread -O3 -Wall openblas-example.c -o openblas-example -lopenblas -lpthread -lm

cublas-example: cublas-example.cu Makefile
	nvcc -lcublas -std=c++11 cublas-example.cu -o cublas-example

clean:
	rm -f *.o $(BINARIES)

run: $(BINARIES)
	./openblas-example
	./cublas-example

posts/math-libraries-intro/README.md
Lines changed: 128 additions & 0 deletions
# GEMM Examples

Matrix Multiplication performed using OpenBLAS and cuBLAS.

## Getting Started

### Packages Used

- CUDA Toolkit 11.3
- OpenBLAS 0.2.19

### Hardware Specifications

**CPU:**
Intel(R) Xeon(R) CPU E5-2698 v3 @ 2.30GHz

**GPU:**
Tesla V100-PCIE 32GB

#### Set Environment Variables

`export OPENBLAS_NUM_THREADS=32`

#### Set GPU Clocks

- Copy the code block below into your `.bashrc` file

```bash
scs () {

    module load cuda/11.2.1

    DATE=$(date +"%m%d%y-%H%M%S")
    G_NAME=$(nvidia-smi -i 0 --query-gpu=gpu_name --format=csv,nounits,noheader | sed 's/ /-/g')
    G_CLK=$(nvidia-smi -i 0 --query-gpu=clocks.max.sm --format=csv,nounits,noheader)
    M_CLK=$(nvidia-smi -i 0 --query-gpu=clocks.max.memory --format=csv,nounits,noheader)
    P_LIMIT=$(nvidia-smi -i 0 --query-gpu=power.max_limit --format=csv,nounits,noheader)
    DRIVER=$(nvidia-smi -i 0 --query-gpu=driver_version --format=csv,nounits,noheader)

    sudo nvidia-smi
    sudo nvidia-smi -pm ENABLED
    sudo nvidia-smi --auto-boost-default=0
    sudo nvidia-smi -ac ${M_CLK},${G_CLK}
    sudo nvidia-smi -lgc ${G_CLK},${G_CLK}
    sudo nvidia-smi -pl ${P_LIMIT}
    sudo nvidia-smi -q -d POWER,CLOCK
}
```

- Update with `source ~/.bashrc`
- Run `scs` to apply the settings

## Running Examples

### Build

Clone the repository, then:

`cd code-samples/posts/math-libraries-intro`

`make`
### OpenBLAS

#### Run

`./openblas-example`

#### Sample Output

```text
This example computes real matrix C=alpha*A*B+beta*C using
OpenBLAS dgemm, where A, B, and C are matrices and
alpha and beta are double precision scalars

Initializing data for matrix multiplication C=A*B for matrix
A(4092x4092) and matrix B(4092x4092)

Allocating memory for matrices aligned on 64-byte boundary for better
performance

Time Elapsed: 414.35 ms

Example completed.
```

### cuBLAS

#### Run

`./cublas-example`

#### Sample Output

```text
This example computes real matrix C=alpha*A*B+beta*C using
cuBLAS dgemm, where A, B, and C are matrices and
alpha and beta are double precision scalars

Initializing data for matrix multiplication C=A*B for matrix
A(4092x4092) and matrix B(4092x4092)

Allocating memory for matrices aligned on 64-byte boundary for better
performance

Computing matrix product using cuBLAS dgemm function

Computations completed.

Time Elapsed: 19.80 ms

Deallocating memory

Example completed.
```

posts/math-libraries-intro/cublas-example.cu
Lines changed: 107 additions & 0 deletions
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <stdexcept>
#include <vector>

#include <cublas_v2.h>
#include <cuda_runtime.h>

// CUDA API error checking
#define CUDA_CHECK(err)                                                        \
  do {                                                                         \
    cudaError_t err_ = (err);                                                  \
    if (err_ != cudaSuccess) {                                                 \
      std::printf("CUDA error %d at %s:%d\n", err_, __FILE__, __LINE__);       \
      throw std::runtime_error("CUDA error");                                  \
    }                                                                          \
  } while (0)

// cublas API error checking
#define CUBLAS_CHECK(err)                                                      \
  do {                                                                         \
    cublasStatus_t err_ = (err);                                               \
    if (err_ != CUBLAS_STATUS_SUCCESS) {                                       \
      std::printf("cublas error %d at %s:%d\n", err_, __FILE__, __LINE__);     \
      throw std::runtime_error("cublas error");                                \
    }                                                                          \
  } while (0)

int main(int argc, char *argv[]) {
  cublasHandle_t cublasH = NULL;
  cudaStream_t stream = NULL;

  cudaEvent_t startEvent { nullptr };
  cudaEvent_t stopEvent { nullptr };
  float elapsed_gpu_ms {};

  int m, n, k;
  int lda, ldb, ldc;
  double alpha, beta;

  printf("\n This example computes real matrix C=alpha*A*B+beta*C using \n"
         " cuBLAS dgemm, where A, B, and C are matrices and \n"
         " alpha and beta are double precision scalars\n\n");

  int size = 4092;
  m = size, k = size, n = size;
  lda = size, ldb = size, ldc = size;
  printf(" Initializing data for matrix multiplication C=A*B for matrix \n"
         " A(%ix%i) and matrix B(%ix%i)\n\n", m, k, k, n);
  alpha = 1.0, beta = 0.0;

  printf(" Allocating memory for matrices aligned on 64-byte boundary for better \n"
         " performance \n\n");

  double *d_A = nullptr;
  double *d_B = nullptr;
  double *d_C = nullptr;

  /* step 1: create cublas handle, bind a stream */
  CUBLAS_CHECK(cublasCreate(&cublasH));

  CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  CUBLAS_CHECK(cublasSetStream(cublasH, stream));

  /* step 2: allocate device memory */
  CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_A), sizeof(double) * m * k));
  CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_B), sizeof(double) * k * n));
  CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_C), sizeof(double) * m * n));

  /* step 3: compute */
  printf(" Computing matrix product using cuBLAS dgemm function \n\n");

  // Blocking-sync events keep the waiting CPU thread from spinning.
  CUDA_CHECK(cudaEventCreateWithFlags(&startEvent, cudaEventBlockingSync));
  CUDA_CHECK(cudaEventCreateWithFlags(&stopEvent, cudaEventBlockingSync));

  // Record the events on the same stream the GEMMs run on, so they bracket the work.
  CUDA_CHECK(cudaEventRecord(startEvent, stream));

  // Run the GEMM 10 times and report the average time per call.
  for (int i = 0; i < 10; i++)
    CUBLAS_CHECK(
        cublasDgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc));

  CUDA_CHECK(cudaStreamSynchronize(stream));

  CUDA_CHECK(cudaEventRecord(stopEvent, stream));
  CUDA_CHECK(cudaEventSynchronize(stopEvent));

  printf("\n Computations completed.\n\n");

  CUDA_CHECK(cudaEventElapsedTime(&elapsed_gpu_ms, startEvent, stopEvent));
  printf(" Time Elapsed: %0.2f ms \n\n", elapsed_gpu_ms / 10);

  /* free resources */
  printf("\n Deallocating memory \n\n");
  CUDA_CHECK(cudaFree(d_A));
  CUDA_CHECK(cudaFree(d_B));
  CUDA_CHECK(cudaFree(d_C));

  CUBLAS_CHECK(cublasDestroy(cublasH));

  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaDeviceReset());

  printf(" Example completed. \n\n");
  return 0;
}
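
Note that the sample above only allocates `d_A`, `d_B`, and `d_C`; it never copies host data to the device, so the GEMM runs on whatever the device memory happens to contain. Below is a minimal sketch of the host-side initialization and transfers that could sit between steps 2 and 3; `h_A`, `h_B`, `h_C`, and the constant fill values are hypothetical and not part of the original sample, while `d_A`, `d_B`, `d_C`, `stream`, and `CUDA_CHECK` refer to the names already defined above.

```cpp
// Hypothetical addition, not part of the original sample: fill host buffers
// and copy them to the device so the GEMM operates on defined data.
std::vector<double> h_A(static_cast<size_t>(m) * k, 1.0);  // arbitrary fill values
std::vector<double> h_B(static_cast<size_t>(k) * n, 2.0);
std::vector<double> h_C(static_cast<size_t>(m) * n, 0.0);

// Copy the inputs on the stream the GEMM will run on (after the cudaMalloc calls).
CUDA_CHECK(cudaMemcpyAsync(d_A, h_A.data(), sizeof(double) * h_A.size(),
                           cudaMemcpyHostToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(d_B, h_B.data(), sizeof(double) * h_B.size(),
                           cudaMemcpyHostToDevice, stream));

// ... cublasDgemm loop as in the example ...

// Retrieve the result and wait for the stream to drain.
CUDA_CHECK(cudaMemcpyAsync(h_C.data(), d_C, sizeof(double) * h_C.size(),
                           cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
```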

posts/math-libraries-intro/openblas-example.c
Lines changed: 47 additions & 0 deletions
#include <cblas.h>
#include <stdio.h>
#include <vector>
#include <chrono>

int main(int argc, char *argv[]) {
  int m, n, k;
  int lda, ldb, ldc;
  double alpha, beta;
  int l, loops;

  std::chrono::high_resolution_clock::time_point start {};
  std::chrono::high_resolution_clock::time_point stop {};
  std::chrono::duration<double, std::milli> elapsed_cpu_ms {};

  printf("\n This example computes real matrix C=alpha*A*B+beta*C using \n"
         " OpenBLAS dgemm, where A, B, and C are matrices and \n"
         " alpha and beta are double precision scalars\n\n");

  int size = 4092;
  m = size, k = size, n = size;
  lda = size, ldb = size, ldc = size;
  alpha = 1.0, beta = 0.0;
  loops = 10;

  printf(" Initializing data for matrix multiplication C=A*B for matrix \n"
         " A(%ix%i) and matrix B(%ix%i)\n\n",
         m, k, k, n);

  // Note: the timer starts before the matrices are allocated, so the reported
  // average includes a share of the allocation cost.
  start = std::chrono::high_resolution_clock::now();
  printf(" Allocating memory for matrices aligned on 64-byte boundary for better \n"
         " performance \n\n");
  std::vector<double> A(m * k);
  std::vector<double> B(k * n);
  std::vector<double> C(m * n);

  // Run the GEMM `loops` times and report the average time per call.
  // B is passed transposed here (CblasTrans), unlike the cuBLAS example.
  for (l = 0; l < loops; l++)
    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, m, n, k, alpha, A.data(), lda, B.data(), ldb, beta, C.data(), ldc);

  stop = std::chrono::high_resolution_clock::now();
  elapsed_cpu_ms = stop - start;

  printf(" Time Elapsed: %0.2f ms \n\n", elapsed_cpu_ms.count() / loops);
  printf(" Example completed. \n\n");

  return 0;
}
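
The README controls CPU parallelism through the `OPENBLAS_NUM_THREADS` environment variable; OpenBLAS also exposes this at run time. Here is a minimal, self-contained sketch, assuming an OpenBLAS build whose `cblas.h` declares the `openblas_set_num_threads` / `openblas_get_num_threads` extensions (recent releases do):

```cpp
#include <cblas.h>
#include <cstdio>

int main() {
    // Pin the OpenBLAS thread count from code instead of the environment;
    // 32 matches the value used in the README.
    openblas_set_num_threads(32);
    std::printf("OpenBLAS will use %d threads\n", openblas_get_num_threads());
    return 0;
}
```

This can be compiled the same way as openblas-example (g++ with `-lopenblas`) to check the thread count the library actually picks up.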
