Skip to content

Commit d53f327

Browse files
authored
Merge pull request #304 from abergeron/largest_block
Get the largest allocatable block size
2 parents 1a92ce3 + 637783a commit d53f327

9 files changed

Lines changed: 76 additions & 23 deletions

File tree

pygpu/gpuarray.pxd

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,14 @@ cdef extern from "numpy/arrayobject.h":
1616
cdef object PyArray_Empty(int a, np.npy_intp *b, np.dtype c, int d)
1717

1818
cdef extern from "Python.h":
19-
int PySlice_GetIndicesEx(slice_object slice, Py_ssize_t length,
19+
int PySlice_GetIndicesEx(object slice, Py_ssize_t length,
2020
Py_ssize_t *start, Py_ssize_t *stop,
2121
Py_ssize_t *step,
2222
Py_ssize_t *slicelength) except -1
2323

24+
cdef extern from "gpuarray/config.h":
25+
int GPUARRAY_API_VERSION
26+
2427
cdef extern from "gpuarray/types.h":
2528
ctypedef struct gpuarray_type:
2629
const char *cluda_name
@@ -100,6 +103,7 @@ cdef extern from "gpuarray/buffer.h":
100103
int GA_CTX_PROP_MAXGSIZE0
101104
int GA_CTX_PROP_MAXGSIZE1
102105
int GA_CTX_PROP_MAXGSIZE2
106+
int GA_CTX_PROP_LARGEST_MEMBLOCK
103107

104108
int GA_BUFFER_PROP_SIZE
105109

@@ -318,8 +322,10 @@ cdef api GpuArray pygpu_concatenate(const _GpuArray **a, size_t n,
318322
object cls, GpuContext context)
319323

320324
cdef api class GpuContext [type PyGpuContextType, object PyGpuContextObject]:
325+
cdef dict __dict__
321326
cdef gpucontext* ctx
322327
cdef readonly bytes kind
328+
cdef object __weakref__
323329

324330
cdef GpuArray new_GpuArray(object cls, GpuContext ctx, object base)
325331

pygpu/gpuarray.pyx

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ from cpython cimport Py_INCREF, PyNumber_Index
1010
from cpython.object cimport Py_EQ, Py_NE
1111

1212
def api_version():
13-
# Those where the last defined numbers.
14-
return (-9997, 1, 0)
13+
# (library version, module version)
14+
return (GPUARRAY_API_VERSION, 0)
1515

1616
np.import_array()
1717

@@ -235,7 +235,7 @@ cdef int strides_ok(GpuArray a, strides):
235235
return 0
236236
upper += max_axis_offset
237237
else:
238-
if lower < -max_axis_offset:
238+
if lower < <size_t>(-max_axis_offset):
239239
return 0
240240
lower += max_axis_offset
241241
return (upper + itemsize) <= size
@@ -874,7 +874,7 @@ def from_gpudata(size_t data, offset, dtype, shape, GpuContext context=None,
874874
free(cdims)
875875
free(cstrides)
876876

877-
def array(proto, dtype=None, copy=True, order=None, int ndmin=0,
877+
def array(proto, dtype=None, copy=True, order=None, unsigned int ndmin=0,
878878
GpuContext context=None, cls=None):
879879
"""
880880
array(obj, dtype='float64', copy=True, order=None, ndmin=0, context=None, cls=None)
@@ -890,7 +890,7 @@ def array(proto, dtype=None, copy=True, order=None, int ndmin=0,
890890
:param order: memory layout of the result
891891
:type order: string
892892
:param ndmin: minimum number of result dimensions
893-
:type ndmin: int
893+
:type ndmin: unsigned int
894894
:param context: allocation context
895895
:type context: GpuContext
896896
:param cls: result class (must inherit from GpuArray)
@@ -1146,6 +1146,13 @@ cdef class GpuContext:
11461146
ctx_property(self, GA_CTX_PROP_MAXGSIZE2, &res)
11471147
return res
11481148

1149+
property largest_memblock:
1150+
"Size of the largest memory block you can allocate"
1151+
def __get__(self):
1152+
cdef size_t res
1153+
ctx_property(self, GA_CTX_PROP_LARGEST_MEMBLOCK, &res)
1154+
return res
1155+
11491156

11501157
cdef class flags(object):
11511158
cdef int fl
@@ -1377,21 +1384,24 @@ cdef GpuArray pygpu_reshape(GpuArray a, unsigned int nd, const size_t *newdims,
13771384
if compute_axis < 0:
13781385
array_reshape(res, a, nd, newdims, ord, nocopy)
13791386
return res
1380-
if compute_axis >= nd:
1387+
cdef unsigned int caxis = <unsigned int>compute_axis
1388+
if caxis >= nd:
13811389
raise ValueError("You wanted us to compute the shape of a dimensions that don't exist")
13821390

13831391
cdef size_t *cdims
13841392
cdef size_t tot = 1
1393+
cdef unsigned int i
13851394
for i in range(nd):
1386-
if i != compute_axis:
1395+
if i != caxis:
13871396
tot *= newdims[i]
13881397
cdims = <size_t *>calloc(nd, sizeof(size_t))
13891398
if cdims == NULL:
13901399
raise MemoryError, "could not allocate cdims"
13911400

1401+
cdef size_t d
13921402
for i in range(nd):
13931403
d = newdims[i]
1394-
if i == compute_axis:
1404+
if i == caxis:
13951405
d = a.size // tot
13961406

13971407
if d * tot != a.size:
@@ -1530,7 +1540,7 @@ cdef class GpuArray:
15301540
k = PyNumber_Index(key)
15311541
if k < 0:
15321542
k += self.ga.dimensions[i]
1533-
if k < 0 or k >= self.ga.dimensions[i]:
1543+
if k < 0 or (<size_t>k) >= self.ga.dimensions[i]:
15341544
raise IndexError, "index %d out of bounds" % (i,)
15351545
start[0] = k
15361546
step[0] = 0
@@ -1539,9 +1549,7 @@ cdef class GpuArray:
15391549
pass
15401550

15411551
if isinstance(key, slice):
1542-
# C compiler complains about argument 1 (key) because it's
1543-
# declared as a PyObject. But we know it's a slice so it's ok.
1544-
PySlice_GetIndicesEx(<slice_object>key, self.ga.dimensions[i],
1552+
PySlice_GetIndicesEx(key, self.ga.dimensions[i],
15451553
start, stop, step, &dummy)
15461554
if stop[0] < start[0] and step[0] > 0:
15471555
stop[0] = start[0]

setup.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,32 +97,41 @@ def __init__(self, *args, **kwargs):
9797
fullversion = "%s"
9898
""" % (MAJOR, MINOR, PATCH, SUFFIX, FULLVERSION))
9999

100+
ea = []
101+
if sys.platform in ('darwin', 'linux'):
102+
# Silence unused stuff warnings
103+
ea = ["-Wno-unused-variable", "-Wno-unused-function"]
104+
100105
exts = [Extension('pygpu.gpuarray',
101106
sources=['pygpu/gpuarray.pyx'],
102107
include_dirs=include_dirs,
103108
libraries=['gpuarray'],
104109
library_dirs=library_dirs,
110+
extra_compile_args=ea,
105111
define_macros=[('GPUARRAY_SHARED', None)]
106112
),
107113
Extension('pygpu.blas',
108114
sources=['pygpu/blas.pyx'],
109115
include_dirs=include_dirs,
110116
libraries=['gpuarray'],
111117
library_dirs=library_dirs,
118+
extra_compile_args=ea,
112119
define_macros=[('GPUARRAY_SHARED', None)]
113120
),
114121
Extension('pygpu._elemwise',
115122
sources=['pygpu/_elemwise.pyx'],
116123
include_dirs=include_dirs,
117124
libraries=['gpuarray'],
118125
library_dirs=library_dirs,
126+
extra_compile_args=ea,
119127
define_macros=[('GPUARRAY_SHARED', None)]
120128
),
121129
Extension('pygpu.collectives',
122130
sources=['pygpu/collectives.pyx'],
123131
include_dirs=include_dirs,
124132
libraries=['gpuarray'],
125133
library_dirs=library_dirs,
134+
extra_compile_args=ea,
126135
define_macros=[('GPUARRAY_SHARED', None)]
127136
)]
128137

src/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ set_target_properties(gpuarray PROPERTIES
8888
INSTALL_NAME_DIR ${CMAKE_INSTALL_PREFIX}/lib
8989
MACOSX_RPATH OFF
9090
# This is the shared library version
91-
VERSION 0.0
91+
VERSION 0.1
9292
)
9393

9494
add_library(gpuarray-static STATIC ${GPUARRAY_SRC})

src/gpuarray/buffer.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -689,6 +689,13 @@ GPUARRAY_PUBLIC gpucontext *gpukernel_context(gpukernel *k);
689689
*/
690690
#define GA_CTX_PROP_PCIBUSID 19
691691

692+
/**
693+
* Get the largest single block of memory that can be allocated.
694+
*
695+
* Type: `size_t`
696+
*/
697+
#define GA_CTX_PROP_LARGEST_MEMBLOCK 20
698+
692699
/* Start at 512 for GA_BUFFER_PROP_ */
693700
#define GA_BUFFER_PROP_START 512
694701

src/gpuarray/config.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#ifndef GPUARRAY_CONFIG
22
#define GPUARRAY_CONFIG
33

4+
#define GPUARRAY_API_VERSION 0
5+
46
#ifdef GPUARRAY_SHARED
57
#ifdef _WIN32
68
#ifdef GPUARRAY_BUILDING_DLL

src/gpuarray_blas_cuda_cublas.c

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -172,16 +172,12 @@ static int setup(gpucontext *c) {
172172
blas_handle *handle;
173173
const char *tmp[2];
174174
cublasStatus_t err;
175-
int e;
176175
int types[10];
176+
int e;
177177

178178
if (ctx->blas_handle != NULL)
179179
return GA_NO_ERROR;
180180

181-
e = load_libcublas(ctx->major, ctx->minor);
182-
if (e != GA_NO_ERROR)
183-
return e;
184-
185181
handle = calloc(1, sizeof(*handle));
186182
if (handle == NULL)
187183
return GA_MEMORY_ERROR;

src/gpuarray_buffer_cuda.c

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include "private.h"
44
#include "private_cuda.h"
55
#include "loaders/libnvrtc.h"
6+
#include "loaders/libcublas.h"
67

78
#include <sys/types.h>
89

@@ -443,6 +444,21 @@ static void find_best(cuda_context *ctx, gpudata **best, gpudata **prev,
443444
}
444445
}
445446

447+
static size_t largest_size(cuda_context *ctx) {
448+
gpudata *temp;
449+
size_t sz, dummy;
450+
cuda_enter(ctx);
451+
ctx->err = cuMemGetInfo(&sz, &dummy);
452+
cuda_exit(ctx);
453+
/* We guess that we can allocate at least a quarter of the free size
454+
in a single block. This might be wrong though. */
455+
sz /= 4;
456+
for (temp = ctx->freeblocks; temp; temp = temp->next) {
457+
if (temp->sz > sz) sz = temp->sz;
458+
}
459+
return sz;
460+
}
461+
446462
/*
447463
* Allocate a new block and place in on the freelist. Will allocate
448464
* the bigger of the requested size and BLOCK_SIZE to avoid allocating
@@ -1393,6 +1409,7 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
13931409
}
13941410
ctx->err = cuDeviceGetName(s, 256, id);
13951411
if (ctx->err != CUDA_SUCCESS) {
1412+
free(s);
13961413
cuda_exit(ctx);
13971414
return GA_IMPL_ERROR;
13981415
}
@@ -1414,8 +1431,6 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
14141431
}
14151432
ctx->err = cuDeviceGetPCIBusId(s, 13, id);
14161433
if (ctx->err != CUDA_SUCCESS) {
1417-
/* PS: in GA_CTX_PROP_DEVNAME above, s is not freed here.
1418-
* I think it should be freed, isn't it ? */
14191434
free(s);
14201435
cuda_exit(ctx);
14211436
return GA_IMPL_ERROR;
@@ -1424,6 +1439,10 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
14241439
cuda_exit(ctx);
14251440
return GA_NO_ERROR;
14261441

1442+
case GA_CTX_PROP_LARGEST_MEMBLOCK:
1443+
*((size_t *)res) = largest_size(ctx);
1444+
return GA_NO_ERROR;
1445+
14271446
case GA_CTX_PROP_MAXLSIZE:
14281447
cuda_enter(ctx);
14291448
ctx->err = cuCtxGetDevice(&id);
@@ -1494,6 +1513,11 @@ static int cuda_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
14941513
return GA_NO_ERROR;
14951514

14961515
case GA_CTX_PROP_BLAS_OPS:
1516+
{
1517+
int e = load_libcublas(major, minor);
1518+
if (e != GA_NO_ERROR)
1519+
return e;
1520+
}
14971521
*((gpuarray_blas_ops **)res) = &cublas_ops;
14981522
return GA_NO_ERROR;
14991523

src/gpuarray_buffer_opencl.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1292,12 +1292,13 @@ static int cl_property(gpucontext *c, gpudata *buf, gpukernel *k, int prop_id,
12921292
return GA_NO_ERROR;
12931293

12941294
case GA_CTX_PROP_FREE_GMEM:
1295+
/* There is no way to query free memory so we just return the
1296+
largest block size */
1297+
case GA_CTX_PROP_LARGEST_MEMBLOCK:
12951298
ctx->err = clGetContextInfo(ctx->ctx, CL_CONTEXT_DEVICES, sizeof(id), &id,
12961299
NULL);
12971300
if (ctx->err != GA_NO_ERROR)
12981301
return GA_IMPL_ERROR;
1299-
/* XXX: This is not exaclty the amount of free memory but there is
1300-
no way to query that in the OpenCL API. */
13011302
ctx->err = clGetDeviceInfo(id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(sz),
13021303
&sz, NULL);
13031304
if (ctx->err != GA_NO_ERROR)

0 commit comments

Comments
 (0)