Check the compute capability of the current device and request tensor cores only when the device supports them.

abergeron · abergeron · commit cb1219abb19d · 2017-09-29T17:48:43.000-04:00
diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c
@@ -200,8 +200,10 @@ static const char *code_dgerBH_gen_small =                              \
 static int setup(gpucontext *c) {
   cuda_context *ctx = (cuda_context *)c;
   blas_handle *handle;
+  CUdevice dev;
   cublasStatus_t err;
   int types[10];
+  int major, minor;
   int e;
 
   if (ctx->blas_handle != NULL)
@@ -211,14 +213,24 @@ static int setup(gpucontext *c) {
   if (handle == NULL)
     return error_sys(ctx->err, "calloc");
 
+  cuda_enter(ctx);
+  {
+    CUresult err;
+    err = cuCtxGetDevice(&dev);
+    if (err != CUDA_SUCCESS) {
+      cuda_exit(ctx);
+      return error_cuda(ctx->err, "cuCtxGetDevice", err);
+    }
+  }
+  GA_CUDA_EXIT_ON_ERROR(ctx, get_cc(dev, &major, &minor, ctx->err));
+
   /* Only try to use tensor core on cuda 9 and up */
-  if (ctx->major >= 9) {
+  if (ctx->major >= 9 && major >= 7 && minor >= 0) {
     handle->tensorCore = 1;
   } else {
     handle->tensorCore = 0;
   }
 
-  cuda_enter(ctx);
   err = cublasCreate(&handle->h);
   if (err != CUBLAS_STATUS_SUCCESS) {
     cuda_exit(ctx);
@@ -507,7 +519,7 @@ static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB,
 					    CUDA_R_16F,
 					    ldc));
   }
-    
+
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ));
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ));
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C, CUDA_WAIT_ALL));
diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c
@@ -1048,7 +1048,7 @@ static int cuda_memset(gpudata *dst, size_t dstoff, int data) {
     return GA_NO_ERROR;
 }
 
-static int get_cc(CUdevice dev, int *maj, int *min, error *e) {
+int get_cc(CUdevice dev, int *maj, int *min, error *e) {
   CUresult err;
   err = cuDeviceGetAttribute(maj,
                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
diff --git a/src/private_cuda.h b/src/private_cuda.h
@@ -157,4 +157,6 @@ struct _gpukernel {
 #endif
 };
 
+int get_cc(CUdevice dev, int *maj, int *min, error *e);
+
 #endif

Original file line number	Diff line number	Diff line change
`@@ -1048,7 +1048,7 @@ static int cuda_memset(gpudata *dst, size_t dstoff, int data) {`
`1048`	`1048`	`return GA_NO_ERROR;`
`1049`	`1049`	`}`
`1050`	`1050`
`1051`		`-static int get_cc(CUdevice dev, int *maj, int *min, error *e) {`
	`1051`	`+int get_cc(CUdevice dev, int *maj, int *min, error *e) {`
`1052`	`1052`	`CUresult err;`
`1053`	`1053`	`err = cuDeviceGetAttribute(maj,`
`1054`	`1054`	`CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,`
-Original file line number
+Diff line change
 #endif
 };
 +int get_cc(CUdevice dev, int *maj, int *min, error *e);
++
 #endif