Skip to content

Commit 8e6df64

Browse files
authored
Merge pull request #270 from abergeron/send_buffer
Add a way to use the cuda IPC functionality to send and receive buffers across processes.
2 parents f2c3227 + f13cb65 commit 8e6df64

7 files changed

Lines changed: 107 additions & 14 deletions

File tree

pygpu/gpuarray.pxd

Lines changed: 3 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -196,6 +196,9 @@ cdef extern from "gpuarray/array.h":
196196

197197
cdef extern from "gpuarray/extension.h":
198198
void *gpuarray_get_extension(const char *)
199+
ctypedef struct GpuArrayIpcMemHandle:
200+
pass
201+
199202
cdef int GPUARRAY_CUDA_CTX_NOFREE
200203

201204
cdef type get_exc(int errcode)

pygpu/gpuarray.pyx

Lines changed: 42 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -789,7 +789,7 @@ def from_gpudata(size_t data, offset, dtype, shape, GpuContext context=None,
789789
:type shape: iterable of ints
790790
:param context: context of the gpudata
791791
:type context: GpuContext
792-
:param strides: strides for the results
792+
:param strides: strides for the results (C contiguous if not specified)
793793
:type strides: iterable of ints
794794
:param writable: is the data writable?
795795
:type writeable: bool
@@ -839,7 +839,7 @@ def from_gpudata(size_t data, offset, dtype, shape, GpuContext context=None,
839839
else:
840840
size = gpuarray_get_elsize(typecode)
841841
for i in range(nd-1, -1, -1):
842-
strides[i] = size
842+
cstrides[i] = size
843843
size *= cdims[i]
844844

845845
return pygpu_fromgpudata(<gpudata *>data, offset, typecode, nd, cdims,
@@ -1424,6 +1424,33 @@ def _concatenate(list al, unsigned int axis, int restype, object cls,
14241424
finally:
14251425
PyMem_Free(als)
14261426

1427+
cdef int (*cuda_get_ipc_handle)(gpudata *, GpuArrayIpcMemHandle *)
1428+
cdef gpudata *(*cuda_open_ipc_handle)(gpucontext *, GpuArrayIpcMemHandle *, size_t)
1429+
1430+
cuda_get_ipc_handle = <int (*)(gpudata *, GpuArrayIpcMemHandle *)>gpuarray_get_extension("cuda_get_ipc_handle")
1431+
cuda_open_ipc_handle = <gpudata *(*)(gpucontext *, GpuArrayIpcMemHandle *, size_t)>gpuarray_get_extension("cuda_open_ipc_handle")
1432+
1433+
def open_ipc_handle(GpuContext c, bytes hpy, size_t l):
1434+
"""
1435+
Open an IPC handle to get a new GpuArray from it.
1436+
1437+
:param c: context
1438+
:param hpy: binary handle data received
1439+
:param l: size of the referred memory block
1440+
1441+
"""
1442+
cdef char *b
1443+
cdef GpuArrayIpcMemHandle h
1444+
cdef gpudata *d
1445+
1446+
b = hpy
1447+
memcpy(&h, b, sizeof(h))
1448+
1449+
d = cuda_open_ipc_handle(c.ctx, &h, l)
1450+
if d is NULL:
1451+
raise GpuArrayException, "could not open handle"
1452+
return <size_t>d
1453+
14271454
cdef class GpuArray:
14281455
"""
14291456
Device array
@@ -1561,6 +1588,19 @@ cdef class GpuArray:
15611588
raise ValueError, "GpuArray and Numpy array do not have the same size in bytes"
15621589
array_read(np.PyArray_DATA(dst), sz, self)
15631590

1591+
def get_ipc_handle(self):
1592+
cdef GpuArrayIpcMemHandle h
1593+
cdef int err
1594+
if cuda_get_ipc_handle is NULL:
1595+
raise SystemError, "Could not get necessary extension"
1596+
if self.context.kind != b'cuda':
1597+
raise ValueError, "Only works for cuda contexts"
1598+
err = cuda_get_ipc_handle(self.ga.data, &h)
1599+
if err != GA_NO_ERROR:
1600+
raise get_exc(err), GpuArray_error(&self.ga, err)
1601+
res = <bytes>(<char *>&h)[:sizeof(h)]
1602+
return res
1603+
15641604
def __array__(self):
15651605
"""
15661606
__array__()

src/gpuarray/ext_cuda.h

Lines changed: 5 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -16,10 +16,12 @@ static void (*cuda_exit)(gpucontext *);
1616
static gpucontext *(*cuda_make_ctx)(CUcontext, int);
1717
static CUstream (*cuda_get_stream)(void *);
1818
static gpudata *(*cuda_make_buf)(void *, CUdeviceptr, size_t);
19-
static CUdeviceptr (*cuda_get_ptr)(gpudata *);
2019
static size_t (*cuda_get_sz)(gpudata *);
2120
static int (*cuda_wait)(gpudata *, int);
2221
static int (*cuda_record)(gpudata *, int);
22+
static CUipcMemHandle (*cuda_get_ipc_handle)(gpudata *d);
23+
static gpudata *(*cuda_open_ipc_handle)(gpucontext *c, CUipcMemHandle h,
24+
size_t sz);
2325

2426
static void setup_ext_cuda(void) {
2527
// The casts are necessary to reassure C++ compilers
@@ -28,10 +30,11 @@ static void setup_ext_cuda(void) {
2830
cuda_make_ctx = (gpucontext *(*)(CUcontext, int))gpuarray_get_extension("cuda_make_ctx");
2931
cuda_get_stream = (CUstream (*)(void *))gpuarray_get_extension("cuda_get_stream");
3032
cuda_make_buf = (gpudata *(*)(void *, CUdeviceptr, size_t))gpuarray_get_extension("cuda_make_buf");
31-
cuda_get_ptr = (CUdeviceptr (*)(gpudata *))gpuarray_get_extension("cuda_get_ptr");
3233
cuda_get_sz = (size_t (*)(gpudata *))gpuarray_get_extension("cuda_get_sz");
3334
cuda_wait = (int (*)(gpudata *, int))gpuarray_get_extension("cuda_wait");
3435
cuda_record = (int (*)(gpudata *, int))gpuarray_get_extension("cuda_record");
36+
cuda_get_ipc_handle = (CUipcMemHandle (*)(gpudata *))gpuarray_get_extension("cuda_get_ipc_handle");
37+
cuda_open_ipc_handle = (gpudata *(*)(gpucontext *c, CUipcMemHandle h, size_t sz))gpuarray_get_extension("cuda_open_ipc_handle");
3538
}
3639

3740
#ifdef __cplusplus

src/gpuarray/extension.h

Lines changed: 4 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -19,6 +19,10 @@ extern "C" {
1919
#define GPUARRAY_CUDA_WAIT_READ 0x10000 /* CUDA_WAIT_READ */
2020
#define GPUARRAY_CUDA_WAIT_WRITE 0x20000 /* CUDA_WAIT_WRITE */
2121

22+
typedef struct _GpuArrayIpcMemHandle {
23+
char priv[64];
24+
} GpuArrayIpcMemHandle;
25+
2226
/**
2327
* Obtain a function pointer for an extension.
2428
*

src/gpuarray_buffer_cuda.c

Lines changed: 46 additions & 7 deletions
Original file line number · Diff line number · Diff line change
@@ -16,9 +16,15 @@
1616
#include "gpuarray/buffer.h"
1717
#include "gpuarray/util.h"
1818
#include "gpuarray/error.h"
19-
#include "gpuarray/extension.h"
2019
#include "gpuarray/buffer_blas.h"
2120

21+
#include "gpuarray/extension.h"
22+
23+
STATIC_ASSERT(DONTFREE == GPUARRAY_CUDA_CTX_NOFREE, cuda_nofree_eq);
24+
STATIC_ASSERT(CUDA_WAIT_READ == GPUARRAY_CUDA_WAIT_READ, cuda_wait_read_eq);
25+
STATIC_ASSERT(CUDA_WAIT_WRITE == GPUARRAY_CUDA_WAIT_WRITE, cuda_wait_write_eq);
26+
STATIC_ASSERT(sizeof(GpuArrayIpcMemHandle) == sizeof(CUipcMemHandle), cuda_ipcmem_eq);
27+
2228
/* Allocations will be made in blocks of at least this size */
2329
#define BLOCK_SIZE (4 * 1024 * 1024)
2430

@@ -555,6 +561,32 @@ static gpudata *cuda_alloc(gpucontext *c, size_t size, void *data, int flags,
555561
return res;
556562
}
557563

564+
int cuda_get_ipc_handle(gpudata *d, GpuArrayIpcMemHandle *h) {
565+
ASSERT_BUF(d);
566+
cuda_enter(d->ctx);
567+
CUDA_EXIT_ON_ERROR(d->ctx,
568+
cuIpcGetMemHandle((CUipcMemHandle *)h, d->ptr));
569+
cuda_exit(d->ctx);
570+
return GA_NO_ERROR;
571+
}
572+
573+
gpudata *cuda_open_ipc_handle(gpucontext *c, GpuArrayIpcMemHandle *h, size_t sz) {
574+
CUdeviceptr p;
575+
cuda_context *ctx = (cuda_context *)c;
576+
gpudata *d = NULL;
577+
578+
cuda_enter(ctx);
579+
ctx->err = cuIpcOpenMemHandle(&p, *((CUipcMemHandle *)h),
580+
CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
581+
if (ctx->err == CUDA_SUCCESS) {
582+
d = cuda_make_buf(ctx, p, sz);
583+
if (d != NULL)
584+
d->flags |= CUDA_IPC_MEMORY;
585+
}
586+
cuda_exit(ctx);
587+
return d;
588+
}
589+
558590
static void cuda_retain(gpudata *d) {
559591
ASSERT_BUF(d);
560592
d->refcnt++;
@@ -580,6 +612,9 @@ static void cuda_free(gpudata *d) {
580612
if (d->flags & DONTFREE) {
581613
/* This is the path for "external" buffers */
582614
deallocate(d);
615+
} else if (d->flags & CUDA_IPC_MEMORY) {
616+
cuIpcCloseMemHandle(d->ptr);
617+
deallocate(d);
583618
} else if (ctx->flags & GA_CTX_DISABLE_ALLOCATION_CACHE) {
584619
/* Just free the pointer */
585620
cuMemFree(d->ptr);
@@ -1354,12 +1389,16 @@ static int cuda_sync(gpudata *b) {
13541389

13551390
ASSERT_BUF(b);
13561391
cuda_enter(ctx);
1357-
ctx->err = cuEventSynchronize(b->wev);
1358-
if (ctx->err != CUDA_SUCCESS)
1359-
err = GA_IMPL_ERROR;
1360-
ctx->err = cuEventSynchronize(b->rev);
1361-
if (ctx->err != CUDA_SUCCESS)
1362-
err = GA_IMPL_ERROR;
1392+
if (ctx->flags & GA_CTX_SINGLE_STREAM) {
1393+
cuStreamSynchronize(ctx->s);
1394+
} else {
1395+
ctx->err = cuEventSynchronize(b->wev);
1396+
if (ctx->err != CUDA_SUCCESS)
1397+
err = GA_IMPL_ERROR;
1398+
ctx->err = cuEventSynchronize(b->rev);
1399+
if (ctx->err != CUDA_SUCCESS)
1400+
err = GA_IMPL_ERROR;
1401+
}
13631402
cuda_exit(ctx);
13641403
return err;
13651404
}

src/gpuarray_extension.c

Lines changed: 4 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -16,6 +16,8 @@ extern void *cuda_make_buf(void);
1616
extern void *cuda_get_sz(void);
1717
extern void *cuda_wait(void);
1818
extern void *cuda_record(void);
19+
extern void *cuda_get_ipc_handle(void);
20+
extern void *cuda_open_ipc_handle(void);
1921
#endif
2022
#ifdef WITH_OPENCL
2123
extern void *cl_make_ctx(void);
@@ -34,6 +36,8 @@ static ext ext_list[] = {
3436
{"cuda_get_sz", cuda_get_sz},
3537
{"cuda_wait", cuda_wait},
3638
{"cuda_record", cuda_record},
39+
{"cuda_get_ipc_handle", cuda_get_ipc_handle},
40+
{"cuda_open_ipc_handle", cuda_open_ipc_handle},
3741
#endif
3842
#ifdef WITH_OPENCL
3943
{"cl_make_ctx", cl_make_ctx},

src/private_cuda.h

Lines changed: 3 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -123,7 +123,6 @@ struct _gpudata {
123123

124124
GPUARRAY_LOCAL gpudata *cuda_make_buf(cuda_context *c, CUdeviceptr p,
125125
size_t sz);
126-
GPUARRAY_LOCAL CUdeviceptr cuda_get_ptr(gpudata *g);
127126
GPUARRAY_LOCAL size_t cuda_get_sz(gpudata *g);
128127
GPUARRAY_LOCAL int cuda_wait(gpudata *, int);
129128
GPUARRAY_LOCAL int cuda_record(gpudata *, int);
@@ -135,8 +134,9 @@ GPUARRAY_LOCAL int cuda_record(gpudata *, int);
135134

136135
#define CUDA_WAIT_ALL (CUDA_WAIT_READ|CUDA_WAIT_WRITE)
137136

138-
#define CUDA_HEAD_ALLOC 0x40000
139-
#define CUDA_MAPPED_PTR 0x80000
137+
#define CUDA_IPC_MEMORY 0x100000
138+
#define CUDA_HEAD_ALLOC 0x200000
139+
#define CUDA_MAPPED_PTR 0x400000
140140

141141
struct _gpukernel {
142142
cuda_context *ctx; /* Keep the context first */

0 commit comments

Comments
 (0)