Skip to content

Commit d53f327

Browse files
authored
Merge pull request #304 from abergeron/largest_block
Get the largest allocatable block size
2 parents 1a92ce3 + 637783a commit d53f327

9 files changed

Lines changed: 76 additions & 23 deletions

File tree

pygpu/gpuarray.pxd

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,14 @@ cdef extern from "numpy/arrayobject.h":
1616
cdef object PyArray_Empty(int a, np.npy_intp *b, np.dtype c, int d)
1717

1818
cdef extern from "Python.h":
19-
int PySlice_GetIndicesEx(slice_object slice, Py_ssize_t length,
19+
int PySlice_GetIndicesEx(object slice, Py_ssize_t length,
2020
Py_ssize_t *start, Py_ssize_t *stop,
2121
Py_ssize_t *step,
2222
Py_ssize_t *slicelength) except -1
2323

24+
cdef extern from "gpuarray/config.h":
25+
int GPUARRAY_API_VERSION
26+
2427
cdef extern from "gpuarray/types.h":
2528
ctypedef struct gpuarray_type:
2629
const char *cluda_name
@@ -100,6 +103,7 @@ cdef extern from "gpuarray/buffer.h":
100103
int GA_CTX_PROP_MAXGSIZE0
101104
int GA_CTX_PROP_MAXGSIZE1
102105
int GA_CTX_PROP_MAXGSIZE2
106+
int GA_CTX_PROP_LARGEST_MEMBLOCK
103107

104108
int GA_BUFFER_PROP_SIZE
105109

@@ -318,8 +322,10 @@ cdef api GpuArray pygpu_concatenate(const _GpuArray **a, size_t n,
318322
object cls, GpuContext context)
319323

320324
cdef api class GpuContext [type PyGpuContextType, object PyGpuContextObject]:
325+
cdef dict __dict__
321326
cdef gpucontext* ctx
322327
cdef readonly bytes kind
328+
cdef object __weakref__
323329

324330
cdef GpuArray new_GpuArray(object cls, GpuContext ctx, object base)
325331

pygpu/gpuarray.pyx

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ from cpython cimport Py_INCREF, PyNumber_Index
1010
from cpython.object cimport Py_EQ, Py_NE
1111

1212
def api_version():
13-
# Those where the last defined numbers.
14-
return (-9997, 1, 0)
13+
# (library version, module version)
14+
return (GPUARRAY_API_VERSION, 0)
1515

1616
np.import_array()
1717

@@ -235,7 +235,7 @@ cdef int strides_ok(GpuArray a, strides):
235235
return 0
236236
upper += max_axis_offset
237237
else:
238-
if lower < -max_axis_offset:
238+
if lower < <size_t>(-max_axis_offset):
239239
return 0
240240
lower += max_axis_offset
241241
return (upper + itemsize) <= size
@@ -874,7 +874,7 @@ def from_gpudata(size_t data, offset, dtype, shape, GpuContext context=None,
874874
free(cdims)
875875
free(cstrides)
876876

877-
def array(proto, dtype=None, copy=True, order=None, int ndmin=0,
877+
def array(proto, dtype=None, copy=True, order=None, unsigned int ndmin=0,
878878
GpuContext context=None, cls=None):
879879
"""
880880
array(obj, dtype='float64', copy=True, order=None, ndmin=0, context=None, cls=None)
@@ -890,7 +890,7 @@ def array(proto, dtype=None, copy=True, order=None, int ndmin=0,
890890
:param order: memory layout of the result
891891
:type order: string
892892
:param ndmin: minimum number of result dimensions
893-
:type ndmin: int
893+
:type ndmin: unsigned int
894894
:param context: allocation context
895895
:type context: GpuContext
896896
:param cls: result class (must inherit from GpuArray)
@@ -1146,6 +1146,13 @@ cdef class GpuContext:
11461146
ctx_property(self, GA_CTX_PROP_MAXGSIZE2, &res)
11471147
return res
11481148

1149+
property largest_memblock:
1150+
"Size of the largest memory block you can allocate"
1151+
def __get__(self):
1152+
cdef size_t res
1153+
ctx_property(self, GA_CTX_PROP_LARGEST_MEMBLOCK, &res)
1154+
return res
1155+
11491156

11501157
cdef class flags(object):
11511158
cdef int fl
@@ -1377,21 +1384,24 @@ cdef GpuArray pygpu_reshape(GpuArray a, unsigned int nd, const size_t *newdims,
13771384
if compute_axis < 0:
13781385
array_reshape(res, a, nd, newdims, ord, nocopy)
13791386
return res
1380-
if compute_axis >= nd:
1387+
cdef unsigned int caxis = <unsigned int>compute_axis
1388+
if caxis >= nd:
13811389
raise ValueError("You wanted us to compute the shape of a dimensions that don't exist")
13821390

13831391
cdef size_t *cdims
13841392
cdef size_t tot = 1
1393+
cdef unsigned int i
13851394
for i in range(nd):
1386-
if i != compute_axis:
1395+
if i != caxis:
13871396
tot *= newdims[i]
13881397
cdims = <size_t *>calloc(nd, sizeof(size_t))
13891398
if cdims == NULL:
13901399
raise MemoryError, "could not allocate cdims"
13911400

1401+
cdef size_t d
13921402
for i in range(nd):
13931403
d = newdims[i]
1394-
if i == compute_axis:
1404+
if i == caxis:
13951405
d = a.size // tot
13961406

13971407
if d * tot != a.size:
@@ -1530,7 +1540,7 @@ cdef class GpuArray:
15301540
k = PyNumber_Index(key)
15311541
if k < 0:
15321542
k += self.ga.dimensions[i]
1533-
if k < 0 or k >= self.ga.dimensions[i]:
1543+
if k < 0 or (<size_t>k) >= self.ga.dimensions[i]:
15341544
raise IndexError, "index %d out of bounds" % (i,)
15351545
start[0] = k
15361546
step[0] = 0
@@ -1539,9 +1549,7 @@ cdef class GpuArray:
15391549
pass
15401550

15411551
if isinstance(key, slice):
1542-
# C compiler complains about argument 1 (key) because it's
1543-
# declared as a PyObject. But we know it's a slice so it's ok.
1544-
PySlice_GetIndicesEx(<slice_object>key, self.ga.dimensions[i],
1552+
PySlice_GetIndicesEx(key, self.ga.dimensions[i],
15451553
start, stop, step, &dummy)
15461554
if stop[0] < start[0] and step[0] > 0:
15471555
stop[0] = start[0]

setup.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,32 +97,41 @@ def __init__(self, *args, **kwargs):
9797
fullversion = "%s"
9898
""" % (MAJOR, MINOR, PATCH, SUFFIX, FULLVERSION))
9999

100+
ea = []
101+
if sys.platform in ('darwin', 'linux'):
102+
# Silence unused stuff warnings
103+
ea = ["-Wno-unused-variable", "-Wno-unused-function"]
104+
100105
exts = [Extension('pygpu.gpuarray',
101106
sources=['pygpu/gpuarray.pyx'],
102107
include_dirs=include_dirs,
103108
libraries=['gpuarray'],
104109
library_dirs=library_dirs,
110+
extra_compile_args=ea,
105111
define_macros=[('GPUARRAY_SHARED', None)]
106112
),
107113
Extension('pygpu.blas',
108114
sources=['pygpu/blas.pyx'],
109115
include_dirs=include_dirs,
110116
libraries=['gpuarray'],
111117
library_dirs=library_dirs,
118+
extra_compile_args=ea,
112119
define_macros=[('GPUARRAY_SHARED', None)]
113120
),
114121
Extension('pygpu._elemwise',
115122
sources=['pygpu/_elemwise.pyx'],
116123
include_dirs=include_dirs,
117124
libraries=['gpuarray'],
118125
library_dirs=library_dirs,
126+
extra_compile_args=ea,
119127
define_macros=[('GPUARRAY_SHARED', None)]
120128
),
121129
Extension('pygpu.collectives',
122130
sources=['pygpu/collectives.pyx'],
123131
include_dirs=include_dirs,
124132
libraries=['gpuarray'],
125133
library_dirs=library_dirs,
134+
extra_compile_args=ea,
126135
define_macros=[('GPUARRAY_SHARED', None)]
127136
)]
128137

src/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ set_target_properties(gpuarray PROPERTIES
8888
INSTALL_NAME_DIR ${CMAKE_INSTALL_PREFIX}/lib
8989
MACOSX_RPATH OFF
9090
# This is the shared library version
91-
VERSION 0.0
91+
VERSION 0.1
9292
)
9393

9494
add_library(gpuarray-static STATIC ${GPUARRAY_SRC})

src/gpuarray/buffer.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -689,6 +689,13 @@ GPUARRAY_PUBLIC gpucontext *gpukernel_context(gpukernel *k);
689689
*/
690690
#define GA_CTX_PROP_PCIBUSID 19
691691

692+
/**
693+
* Get the largest single block of memory that can be allocated.
694+
*
695+
* Type: `size_t`
696+
*/
697+
#define GA_CTX_PROP_LARGEST_MEMBLOCK 20
698+
692699
/* Start at 512 for GA_BUFFER_PROP_ */
693700
#define GA_BUFFER_PROP_START 512
694701

src/gpuarray/config.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#ifndef GPUARRAY_CONFIG
22
#define GPUARRAY_CONFIG
33

4+
#define GPUARRAY_API_VERSION 0
5+
46
#ifdef GPUARRAY_SHARED
57
#ifdef _WIN32
68
#ifdef GPUARRAY_BUILDING_DLL

src/gpuarray_blas_cuda_cublas.c

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -172,16 +172,12 @@ static int setup(gpucontext *c) {
172172
blas_handle *handle;
173173
const char *tmp[2];
174174
cublasStatus_t err;
175-
int e;
176175
int types[10];
176+
int e;
177177

178178
if (ctx->blas_handle != NULL)
179179
return GA_NO_ERROR;
180180

181-
e = load_libcublas(ctx->major, ctx->minor);
182-
if (e != GA_NO_ERROR)
183-
return e;
184-
185181
handle = calloc(1, sizeof(*handle));
186182
if (handle == NULL)
187183
return GA_MEMORY_ERROR;

src/gpuarray_buffer_cuda.c

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include "private.h"
44
#include "private_cuda.h"
55
#include "loaders/libnvrtc.h"
6+
#include "loaders/libcublas.h"
67

78
#include <sys/types.h>
89

@@ -443,6 +444,21 @@ static void find_best(cuda_context *ctx, gpudata **best, gpudata **prev,
443444
}
444445
}
445446

447+
static size_t largest_size(cuda_context *ctx) {
448+
gpudata *temp;
449+
size_t sz, dummy;
450+
cuda_enter(ctx);
451+
ctx->err = cuMemGetInfo(&sz, &dummy);
452+
cuda_exit(ctx);
453+
/* We guess that we can allocate at least a quarter of the free size
454+
in a single block. This might be wrong though. */
455+
sz /= 4;
456+
for (temp = ctx->freeblocks; temp; temp = temp->next) {
457+
if (temp->sz > sz) sz = temp->sz;
458+
}
459+
return sz;
460+
}
461+
446462
/*
447463
* Allocate a new block and place in on the freelist. Will allocate
448464
* the bigger of the requested size and BLOCK_SIZE to avoid allocating
@@ -1393,6 +1409,7 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
13931409
}
13941410
ctx->err = cuDeviceGetName(s, 256, id);
13951411
if (ctx->err != CUDA_SUCCESS) {
1412+
free(s);
13961413
cuda_exit(ctx);
13971414
return GA_IMPL_ERROR;
13981415
}
@@ -1414,8 +1431,6 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
14141431
}
14151432
ctx->err = cuDeviceGetPCIBusId(s, 13, id);
14161433
if (ctx->err != CUDA_SUCCESS) {
1417-
/* PS: in GA_CTX_PROP_DEVNAME above, s is not freed here.
1418-
* I think it should be freed, isn't it ? */
14191434
free(s);
14201435
cuda_exit(ctx);
14211436
return GA_IMPL_ERROR;
@@ -1424,6 +1439,10 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
14241439
cuda_exit(ctx);
14251440
return GA_NO_ERROR;
14261441

1442+
case GA_CTX_PROP_LARGEST_MEMBLOCK:
1443+
*((size_t *)res) = largest_size(ctx);
1444+
return GA_NO_ERROR;
1445+
14271446
case GA_CTX_PROP_MAXLSIZE:
14281447
cuda_enter(ctx);
14291448
ctx->err = cuCtxGetDevice(&id);
@@ -1494,6 +1513,11 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
14941513
return GA_NO_ERROR;
14951514

14961515
case GA_CTX_PROP_BLAS_OPS:
1516+
{
1517+
int e = load_libcublas(major, minor);
1518+
if (e != GA_NO_ERROR)
1519+
return e;
1520+
}
14971521
*((gpuarray_blas_ops **)res) = &cublas_ops;
14981522
return GA_NO_ERROR;
14991523

src/gpuarray_buffer_opencl.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1292,12 +1292,13 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
12921292
return GA_NO_ERROR;
12931293

12941294
case GA_CTX_PROP_FREE_GMEM:
1295+
/* There is no way to query free memory so we just return the
1296+
largest block size */
1297+
case GA_CTX_PROP_LARGEST_MEMBLOCK:
12951298
ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id,
12961299
NULL);
12971300
if (ctx->err != GA_NO_ERROR)
12981301
return GA_IMPL_ERROR;
1299-
/* XXX: This is not exaclty the amount of free memory but there is
1300-
no way to query that in the OpenCL API. */
13011302
ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(sz),
13021303
&sz, NULL);
13031304
if (ctx->err != GA_NO_ERROR)

0 commit comments

Comments
 (0)