Skip to content

Commit ab2a43a

Browse files
authored
Merge pull request #489 from lamblin/elemwise_padshape
Implicitly left-pad with broadcastable dims
2 parents d4578db + 2c6374e commit ab2a43a

4 files changed

Lines changed: 113 additions & 10 deletions

File tree

pygpu/_elemwise.pyx

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ cdef extern from "gpuarray/elemwise.h":
4040

4141
cdef int GE_BROADCAST
4242
cdef int GE_NOCOLLAPSE
43+
cdef int GE_PADSHAPE
4344

4445

4546
cdef class arg:
@@ -193,9 +194,19 @@ cdef class GpuElemwise:
193194
def __call__(self, *args, **kwargs):
194195
cdef unsigned int i
195196
cdef int err
197+
cdef int flags
198+
199+
flags = 0
200+
if kwargs.pop('broadcast', True):
201+
flags |= GE_BROADCAST
202+
if kwargs.pop('padshape', True):
203+
flags |= GE_PADSHAPE
204+
205+
if len(kwargs) != 0:
206+
raise TypeError("Unknown keyword argument: %s" % list(kwargs.keys())[0])
196207

197208
for i, arg in enumerate(args):
198209
self._setarg(i, arg)
199-
err = GpuElemwise_call(self.ge, self.callbuf, GE_BROADCAST if kwargs.get('broadcast', True) else 0)
210+
err = GpuElemwise_call(self.ge, self.callbuf, flags)
200211
if err != GA_NO_ERROR:
201212
raise get_exc(err)("Could not call GpuElemwise")

src/gpuarray/elemwise.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,11 @@ GPUARRAY_PUBLIC int GpuElemwise_call(GpuElemwise *ge, void **args, int flags);
156156
*/
157157
#define GE_NOCOLLAPSE 0x0200
158158

159+
/**
160+
* Allow implicit left-padding of shape with dimensions of size 1.
*
* Array arguments that have fewer dimensions than the largest array
* argument are handled as if their shape started with extra
* dimensions of size 1 (their strides for those dimensions are 0).
* Without this flag, array arguments whose number of dimensions
* differ make the call fail with GA_VALUE_ERROR.
*
* A padded dimension is only accepted against a larger dimension
* when GE_BROADCAST is set as well.
161+
*/
162+
#define GE_PADSHAPE 0x0400
163+
159164
/**
160165
* @}
161166
*/

src/gpuarray_elemwise.c

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -276,14 +276,21 @@ static int check_basic(GpuElemwise *ge, void **args, int flags,
276276
GpuArray *a = NULL, *v;
277277
unsigned int i, j, p, num_arrays = 0, nd = 0, nnd;
278278
int call32 = 1;
279+
unsigned int nd_i = 0;
280+
size_t v_dim_j = 0;
279281

280282
/* Go through the list and grab some info */
281283
for (i = 0; i < ge->n; i++) {
282284
if (is_array(ge->args[i])) {
285+
nd_i = ((GpuArray *)args[i])->nd;
283286
if (num_arrays == 0)
284-
nd = ((GpuArray *)args[i])->nd;
285-
else if (((GpuArray *)args[i])->nd != nd)
286-
return error_fmt(ctx->err, GA_VALUE_ERROR, "Arg %u has differing nd = %u", i, ((GpuArray *)args[i])->nd);
287+
nd = nd_i;
288+
else if (nd_i != nd) {
289+
if (flags & GE_PADSHAPE)
290+
nd = nd_i > nd ? nd_i : nd;
291+
else
292+
return error_fmt(ctx->err, GA_VALUE_ERROR, "Arg %u has differing nd = %u", i, nd_i);
293+
}
287294
++num_arrays;
288295
if (a == NULL && is_output(ge->args[i]))
289296
a = (GpuArray *)args[i];
@@ -301,15 +308,19 @@ static int check_basic(GpuElemwise *ge, void **args, int flags,
301308
return error_sys(ctx->err, "ge_grow");
302309
}
303310

304-
/* Now we know that all array arguments have the same number of
311+
/* Now we know that all array arguments have at most nd
305312
dimensions and that the expected output size is the size of a */
306313

307314
/* And copy their initial values in */
308315
memcpy(ge->dims, a->dimensions, nd*sizeof(size_t));
309316
p = 0;
310317
for (i = 0; i < ge->n; i++) {
311318
if (is_array(ge->args[i])) {
312-
memcpy(ge->strides[p], ((GpuArray *)args[i])->strides, nd*sizeof(ssize_t));
319+
/* Left-pad strides with zero on implicitly broadcasted dimensions */
320+
memset(ge->strides[p], 0, nd*sizeof(ssize_t));
321+
nd_i = ((GpuArray *)args[i])->nd;
322+
memcpy((char *)(ge->strides[p]) + (nd - nd_i)*sizeof(ssize_t),
323+
((GpuArray *)args[i])->strides, nd_i*sizeof(ssize_t));
313324
p++;
314325
}
315326
}
@@ -326,16 +337,23 @@ static int check_basic(GpuElemwise *ge, void **args, int flags,
326337
for (i = 0; i < ge->n; i++) {
327338
if (is_array(ge->args[i])) {
328339
v = (GpuArray *)args[i];
329-
if (ge->dims[j] != v->dimensions[j]) {
340+
nd_i = v->nd;
341+
/* Pad shape with 1 if needed for implicitly broadcasted dimensions
342+
and shift if needed */
343+
if (j < nd - nd_i)
344+
v_dim_j = 1;
345+
else
346+
v_dim_j = v->dimensions[j - (nd - nd_i)];
347+
if (ge->dims[j] != v_dim_j) {
330348
/* We can't broadcast outputs */
331349
if (ISCLR(flags, GE_BROADCAST) || is_output(ge->args[i]) ||
332-
v->dimensions[j] != 1) {
333-
return error_fmt(ctx->err, GA_VALUE_ERROR, "Mismatched dimension %u for input %u (expected %" SPREFIX "u got %" SPREFIX "u)", j, i, ge->dims[j], v->dimensions[j]);
350+
v_dim_j != 1) {
351+
return error_fmt(ctx->err, GA_VALUE_ERROR, "Mismatched dimension %u for input %u (expected %" SPREFIX "u got %" SPREFIX "u)", j, i, ge->dims[j], v_dim_j);
334352
}
335353
}
336354
/* If the dimension is 1 set the strides to 0 regardless since
337355
it won't change anything in the non-broadcast case. */
338-
if (v->dimensions[j] == 1) {
356+
if (v_dim_j == 1) {
339357
ge->strides[p][j] = 0;
340358
}
341359
call32 &= v->offset < ADDR32_MAX;

tests/check_elemwise.c

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,7 @@ START_TEST(test_basic_offset) {
321321
/* Simulate indexing */
322322
a.offset = 12;
323323
a.dimensions[1] = 3;
324+
GpuArray_fix_flags(&a);
324325

325326
ga_assert_ok(GpuArray_write(&a, data1, sizeof(data1)));
326327

@@ -563,6 +564,73 @@ START_TEST(test_basic_broadcast) {
563564
}
564565
END_TEST
565566

567+
/*
 * Checks implicit shape padding (GE_PADSHAPE) for "c = a + b":
 * a is 1-d with shape (3), b is 2-d with shape (2, 1) and the output c
 * is 2-d with shape (2, 3). The call must be rejected when neither
 * GE_BROADCAST nor GE_PADSHAPE is given, and must produce
 * c[i][j] = a[j] + b[i] when both are given.
 */
START_TEST(test_basic_padshape) {
568+
GpuArray a;
569+
GpuArray b;
570+
GpuArray c;
571+
572+
GpuElemwise *ge;
573+
574+
/* Host-side contents of a and b, and the buffer c is read back into. */
static const uint32_t data1[3] = {1, 2, 3};
575+
static const uint32_t data2[2] = {4, 5};
576+
uint32_t data3[6] = {0};
577+
578+
/* Reused for each allocation; only the first entry is used for a. */
size_t dims[2];
579+
580+
gpuelemwise_arg args[3] = {{0}};
581+
void *rargs[3];
582+
583+
/* a: 1-d, shape (3), C order, holding {1, 2, 3}. */
dims[0] = 3;
584+
585+
ga_assert_ok(GpuArray_empty(&a, ctx, GA_UINT, 1, dims, GA_C_ORDER));
586+
ga_assert_ok(GpuArray_write(&a, data1, sizeof(data1)));
587+
588+
/* b: 2-d, shape (2, 1), F order, holding {4, 5}. */
dims[0] = 2;
589+
dims[1] = 1;
590+
591+
ga_assert_ok(GpuArray_empty(&b, ctx, GA_UINT, 2, dims, GA_F_ORDER));
592+
ga_assert_ok(GpuArray_write(&b, data2, sizeof(data2)));
593+
594+
/* c: 2-d output, shape (2, 3), C order; contents are not initialized here. */
dims[0] = 2;
595+
dims[1] = 3;
596+
597+
ga_assert_ok(GpuArray_empty(&c, ctx, GA_UINT, 2, dims, GA_C_ORDER));
598+
599+
/* Describe the kernel arguments: a and b are read, c is written. */
args[0].name = "a";
600+
args[0].typecode = GA_UINT;
601+
args[0].flags = GE_READ;
602+
603+
args[1].name = "b";
604+
args[1].typecode = GA_UINT;
605+
args[1].flags = GE_READ;
606+
607+
args[2].name = "c";
608+
args[2].typecode = GA_UINT;
609+
args[2].flags = GE_WRITE;
610+
611+
/* NOTE(review): the trailing "3, args, 2, 0" presumably mean argument
   count, argument descriptions, number of dimensions and flags --
   confirm against the GpuElemwise_new() declaration. */
ge = GpuElemwise_new(ctx, "", "c = a + b", 3, args, 2, 0);
612+
613+
ck_assert_ptr_ne(ge, NULL);
614+
615+
/* Runtime arguments, in the same order as args[]. */
rargs[0] = &a;
616+
rargs[1] = &b;
617+
rargs[2] = &c;
618+
619+
/* With neither GE_BROADCAST nor GE_PADSHAPE the call must fail with
   GA_VALUE_ERROR. */
ck_assert_int_eq(GpuElemwise_call(ge, rargs, GE_NOCOLLAPSE), GA_VALUE_ERROR);
620+
621+
/* With padding and broadcasting enabled the same call must succeed. */
ga_assert_ok(GpuElemwise_call(ge, rargs, GE_NOCOLLAPSE | GE_BROADCAST | GE_PADSHAPE));
622+
623+
ga_assert_ok(GpuArray_read(data3, sizeof(data3), &c));
624+
625+
/* Expected c in C order: row 0 is a + 4 = {5, 6, 7},
   row 1 is a + 5 = {6, 7, 8}. */
ck_assert_int_eq(data3[0], 5);
626+
ck_assert_int_eq(data3[1], 6);
627+
ck_assert_int_eq(data3[2], 7);
628+
ck_assert_int_eq(data3[3], 6);
629+
ck_assert_int_eq(data3[4], 7);
630+
ck_assert_int_eq(data3[5], 8);
631+
}
632+
END_TEST
633+
566634
START_TEST(test_basic_collapse) {
567635
GpuArray a;
568636
GpuArray b;
@@ -755,6 +823,7 @@ Suite *get_suite(void) {
755823
tcase_add_test(tc, test_basic_offset);
756824
tcase_add_test(tc, test_basic_remove1);
757825
tcase_add_test(tc, test_basic_broadcast);
826+
tcase_add_test(tc, test_basic_padshape);
758827
tcase_add_test(tc, test_basic_collapse);
759828
tcase_add_test(tc, test_basic_neg_strides);
760829
tcase_add_test(tc, test_basic_0);

0 commit comments

Comments
 (0)