Skip to content

Commit 1c1e068

Browse files
authored
Merge pull request #538 from abergeron/test
Add support for Tensor Cores in the BLAS bindings.
2 parents 2438e7a + cb1219a commit 1c1e068

5 files changed

Lines changed: 78 additions & 12 deletions

File tree

src/gpuarray_blas_cuda_cublas.c

Lines changed: 45 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -72,6 +72,7 @@ typedef struct _blas_handle {
7272
GpuKernel dgemvBH_T_a1_b1_small;
7373
GpuKernel sgerBH_gen_small;
7474
GpuKernel dgerBH_gen_small;
75+
uint8_t tensorCore;
7576
} blas_handle;
7677

7778
#define LARGE_VAL(v) (v >= INT_MAX)
@@ -199,8 +200,10 @@ static const char *code_dgerBH_gen_small = \
199200
static int setup(gpucontext *c) {
200201
cuda_context *ctx = (cuda_context *)c;
201202
blas_handle *handle;
203+
CUdevice dev;
202204
cublasStatus_t err;
203205
int types[10];
206+
int major, minor;
204207
int e;
205208

206209
if (ctx->blas_handle != NULL)
@@ -211,6 +214,23 @@ static int setup(gpucontext *c) {
211214
return error_sys(ctx->err, "calloc");
212215

213216
cuda_enter(ctx);
217+
{
218+
CUresult err;
219+
err = cuCtxGetDevice(&dev);
220+
if (err != CUDA_SUCCESS) {
221+
cuda_exit(ctx);
222+
return error_cuda(ctx->err, "cuCtxGetDevice", err);
223+
}
224+
}
225+
GA_CUDA_EXIT_ON_ERROR(ctx, get_cc(dev, &major, &minor, ctx->err));
226+
227+
/* Only try to use tensor core on cuda 9 and up */
228+
if (ctx->major >= 9 && major >= 7 && minor >= 0) {
229+
handle->tensorCore = 1;
230+
} else {
231+
handle->tensorCore = 0;
232+
}
233+
214234
err = cublasCreate(&handle->h);
215235
if (err != CUBLAS_STATUS_SUCCESS) {
216236
cuda_exit(ctx);
@@ -443,8 +463,8 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB,
443463
ASSERT_BUF(B);
444464
ASSERT_BUF(C);
445465

446-
if (cublasSgemmEx == NULL)
447-
return error_set(ctx->err, GA_DEVSUP_ERROR, "cublasSgemmEx unavailable");
466+
if (cublasSgemmEx == NULL && (cublasGemmEx == NULL || h->tensorCore == 0))
467+
return error_set(ctx->err, GA_DEVSUP_ERROR, "cublasSgemmEx|cublasGemmEx unavailable");
448468

449469
if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) ||
450470
LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) ||
@@ -476,15 +496,29 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB,
476496
GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ));
477497
GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL));
478498

479-
CUBLAS_EXIT_ON_ERROR(ctx, cublasSgemmEx(h->h, convT(transA), convT(transB),
480-
M, N, K,
481-
&alpha, ((uint16_t *)A->ptr) + offA,
482-
CUDA_R_16F,
483-
lda, ((uint16_t *)B->ptr) + offB,
484-
CUDA_R_16F,
485-
ldb, &beta, ((uint16_t *)C->ptr) + offC,
486-
CUDA_R_16F,
487-
ldc));
499+
if (cublasGemmEx != NULL && h->tensorCore) {
500+
CUBLAS_EXIT_ON_ERROR(ctx, cublasGemmEx(h->h, convT(transA), convT(transB),
501+
M, N, K,
502+
&alpha, ((uint16_t *)A->ptr) + offA,
503+
CUDA_R_16F,
504+
lda, ((uint16_t *)B->ptr) + offB,
505+
CUDA_R_16F,
506+
ldb, &beta, ((uint16_t *)C->ptr) + offC,
507+
CUDA_R_16F,
508+
ldc,
509+
CUDA_R_32F,
510+
CUBLAS_GEMM_DFALT_TENSOR_OP));
511+
} else {
512+
CUBLAS_EXIT_ON_ERROR(ctx, cublasSgemmEx(h->h, convT(transA), convT(transB),
513+
M, N, K,
514+
&alpha, ((uint16_t *)A->ptr) + offA,
515+
CUDA_R_16F,
516+
lda, ((uint16_t *)B->ptr) + offB,
517+
CUDA_R_16F,
518+
ldb, &beta, ((uint16_t *)C->ptr) + offC,
519+
CUDA_R_16F,
520+
ldc));
521+
}
488522

489523
GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ));
490524
GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ));

src/gpuarray_buffer_cuda.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1048,7 +1048,7 @@ static int cuda_memset(gpudata *dst, size_t dstoff, int data) {
10481048
return GA_NO_ERROR;
10491049
}
10501050

1051-
static int get_cc(CUdevice dev, int *maj, int *min, error *e) {
1051+
int get_cc(CUdevice dev, int *maj, int *min, error *e) {
10521052
CUresult err;
10531053
err = cuDeviceGetAttribute(maj,
10541054
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,

src/loaders/libcublas.fn

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,8 @@ DEF_PROC_V2(cublasDger, (cublasHandle_t handle, int m, int n, const double *alph
2121

2222
DEF_PROC_OPT(cublasSgemmEx, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const void *A, cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb, const float *beta, void *C, cudaDataType Ctype, int ldc));
2323

24+
DEF_PROC_OPT(cublasGemmEx, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const void *alpha, const void *A, cudaDataType_t Atype, int lda, const void *B, cudaDataType_t Btype, int ldb, const void *beta, void *C, cudaDataType_t Ctype, int ldc, cudaDataType_t computeType, cublasGemmAlgo_t algo));
25+
2426
DEF_PROC(cublasSgemmBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *Aarray[], int lda, const float *Barray[], int ldb, const float *beta, float *Carray[], int ldc, int batchCount));
2527
DEF_PROC(cublasDgemmBatched, (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double *alpha, const double *Aarray[], int lda, const double *Barray[], int ldb, const double *beta, double *Carray[], int ldc, int batchCount));
2628

src/loaders/libcublas.h

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,34 @@ typedef enum cudaDataType_t
3434
CUDA_C_32U= 13 // complex as a pair of unsigned int numbers
3535
} cudaDataType;
3636

37+
typedef cudaDataType cudaDataType_t;
38+
39+
typedef enum {
40+
CUBLAS_GEMM_DFALT = -1,
41+
CUBLAS_GEMM_ALGO0 = 0,
42+
CUBLAS_GEMM_ALGO1 = 1,
43+
CUBLAS_GEMM_ALGO2 = 2,
44+
CUBLAS_GEMM_ALGO3 = 3,
45+
CUBLAS_GEMM_ALGO4 = 4,
46+
CUBLAS_GEMM_ALGO5 = 5,
47+
CUBLAS_GEMM_ALGO6 = 6,
48+
CUBLAS_GEMM_ALGO7 = 7,
49+
CUBLAS_GEMM_ALGO8 = 8,
50+
CUBLAS_GEMM_ALGO9 = 9,
51+
CUBLAS_GEMM_ALGO10 = 10,
52+
CUBLAS_GEMM_ALGO11 = 11,
53+
CUBLAS_GEMM_ALGO12 = 12,
54+
CUBLAS_GEMM_ALGO13 = 13,
55+
CUBLAS_GEMM_ALGO14 = 14,
56+
CUBLAS_GEMM_ALGO15 = 15,
57+
CUBLAS_GEMM_ALGO16 = 16,
58+
CUBLAS_GEMM_ALGO17 = 17,
59+
CUBLAS_GEMM_DFALT_TENSOR_OP = 99,
60+
CUBLAS_GEMM_ALGO0_TENSOR_OP = 100,
61+
CUBLAS_GEMM_ALGO1_TENSOR_OP = 101,
62+
CUBLAS_GEMM_ALGO2_TENSOR_OP = 102
63+
} cublasGemmAlgo_t;
64+
3765
typedef struct CUstream_st *cudaStream_t;
3866

3967
typedef enum {

src/private_cuda.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -157,4 +157,6 @@ struct _gpukernel {
157157
#endif
158158
};
159159

160+
int get_cc(CUdevice dev, int *maj, int *min, error *e);
161+
160162
#endif

0 commit comments

Comments (0)