Skip to content

Commit 8e5f40f

Browse files
authored
Merge pull request #279 from obilaniu/reductions
Make MaxAndArgmax use signed axis numbers internally.
2 parents 8fec16b + cf702b5 commit 8e5f40f

2 files changed

Lines changed: 129 additions & 35 deletions

File tree

src/gpuarray_reduction.c

Lines changed: 41 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -27,27 +27,27 @@ struct maxandargmax_ctx{
2727
GpuArray* dstMax;
2828
GpuArray* dstArgmax;
2929
const GpuArray* src;
30-
unsigned reduxLen;
31-
const unsigned* reduxList;
30+
int reduxLen;
31+
const int* reduxList;
3232

3333
/* General. */
3434
int ret;
35-
unsigned* axisList;
35+
int* axisList;
3636
gpucontext* gpuCtx;
3737

3838
/* Source code Generator. */
3939
const char* dstMaxType;
4040
const char* dstArgmaxType;
41-
unsigned ndd;
42-
unsigned ndr;
43-
unsigned nds;
44-
unsigned ndh;
41+
int ndd;
42+
int ndr;
43+
int nds;
44+
int ndh;
4545
strb s;
4646
char* sourceCode;
4747
GpuKernel kernel;
4848

4949
/* Scheduler */
50-
unsigned hwAxisList[3];
50+
int hwAxisList[3];
5151
size_t blockSize [3];
5252
size_t gridSize [3];
5353
size_t chunkSize [3];
@@ -64,8 +64,8 @@ typedef struct maxandargmax_ctx maxandargmax_ctx;
6464

6565

6666
/* Function prototypes */
67-
static int axisInSet (unsigned v,
68-
const unsigned* set,
67+
static int axisInSet (int v,
68+
const int* set,
6969
size_t setLen,
7070
size_t* where);
7171
static void appendIdxes (strb* s,
@@ -102,7 +102,8 @@ GPUARRAY_PUBLIC int GpuArray_maxandargmax (GpuArray* dstMax,
102102
const GpuArray* src,
103103
unsigned reduxLen,
104104
const unsigned* reduxList){
105-
maxandargmax_ctx ctxSTACK = {dstMax, dstArgmax, src, reduxLen, reduxList},
105+
maxandargmax_ctx ctxSTACK = {dstMax, dstArgmax, src,
106+
(int)reduxLen, (const int*)reduxList},
106107
*ctx = &ctxSTACK;
107108

108109
if(maxandargmaxCheckargs (ctx) == GA_NO_ERROR &&
@@ -127,8 +128,8 @@ GPUARRAY_PUBLIC int GpuArray_maxandargmax (GpuArray* dstMax,
127128
* @return Non-zero if the set is non-empty and v is in it; Zero otherwise.
128129
*/
129130

130-
static int axisInSet (unsigned v,
131-
const unsigned* set,
131+
static int axisInSet (int v,
132+
const int* set,
132133
size_t setLen,
133134
size_t* where){
134135
size_t i;
@@ -190,7 +191,7 @@ static void appendIdxes (strb* s,
190191
*/
191192

192193
static int maxandargmaxCheckargs (maxandargmax_ctx* ctx){
193-
unsigned i;
194+
int i;
194195

195196
/**
196197
* We initialize certain parts of the context.
@@ -216,13 +217,14 @@ static int maxandargmaxCheckargs (maxandargmax_ctx* ctx){
216217

217218
/* Insane src or reduxLen? */
218219
if(!ctx->dstMax || !ctx->dstArgmax || !ctx->src || ctx->src->nd == 0 ||
219-
ctx->reduxLen == 0 || ctx->reduxLen >= ctx->src->nd){
220+
ctx->reduxLen == 0 || ctx->reduxLen > (int)ctx->src->nd){
220221
return ctx->ret=GA_INVALID_ERROR;
221222
}
222223

223224
/* Insane or duplicate list entry? */
224225
for(i=0;i<ctx->reduxLen;i++){
225-
if(ctx->reduxList[i] >= ctx->src->nd ||
226+
if(ctx->reduxList[i] < 0 ||
227+
ctx->reduxList[i] >= (int)ctx->src->nd ||
226228
axisInSet(ctx->reduxList[i], ctx->reduxList, i, 0)){
227229
return ctx->ret=GA_INVALID_ERROR;
228230
}
@@ -260,8 +262,8 @@ static int maxandargmaxCheckargs (maxandargmax_ctx* ctx){
260262
*/
261263

262264
static int maxandargmaxSelectHwAxes (maxandargmax_ctx* ctx){
263-
unsigned i, j, maxI = 0;
264-
size_t maxV;
265+
int i, j, maxI = 0;
266+
size_t maxV;
265267

266268
ctx->ndh = ctx->ndd<3 ? ctx->ndd : 3;
267269

@@ -355,31 +357,33 @@ static void maxandargmaxAppendOffsets (maxandargmax_ctx* ctx){
355357
strb_appends(&ctx->s, "\t\n");
356358
}
357359
static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx){
358-
unsigned i;
360+
int i;
359361
strb_appends(&ctx->s, "\t/* GPU kernel coordinates. Always 3D. */\n");
360362

361363
strb_appends(&ctx->s, "\tX bi0 = GID_0, bi1 = GID_1, bi2 = GID_2;\n");
362364
strb_appends(&ctx->s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n");
363365
strb_appends(&ctx->s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n");
364366
strb_appends(&ctx->s, "\tX gi0 = bi0*bd0+ti0, gi1 = bi1*bd1+ti1, gi2 = bi2*bd2+ti2;\n");
365-
strb_appends(&ctx->s, "\tX ");
366-
for(i=0;i<ctx->ndh;i++){
367-
strb_appendf(&ctx->s, "ci%u = chunkSize[%u]%s",
368-
i, i, (i==ctx->ndh-1) ? ";\n" : ", ");
367+
if(ctx->ndh>0){
368+
strb_appends(&ctx->s, "\tX ");
369+
for(i=0;i<ctx->ndh;i++){
370+
strb_appendf(&ctx->s, "ci%u = chunkSize[%u]%s",
371+
i, i, (i==ctx->ndh-1) ? ";\n" : ", ");
372+
}
369373
}
370374

371375
strb_appends(&ctx->s, "\t\n");
372376
strb_appends(&ctx->s, "\t\n");
373377
strb_appends(&ctx->s, "\t/* Free indices & Reduction indices */\n");
374378

375-
appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "", ";\n");
376-
appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Dim", ";\n");
377-
appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Start", ";\n");
378-
appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "End", ";\n");
379-
appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "SStep", ";\n");
380-
appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "MStep", ";\n");
381-
appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "AStep", ";\n");
382-
appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim", ";\n");
379+
if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "", ";\n");}
380+
if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Dim", ";\n");}
381+
if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Start", ";\n");}
382+
if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "End", ";\n");}
383+
if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "SStep", ";\n");}
384+
if(ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "MStep", ";\n");}
385+
if(ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "AStep", ";\n");}
386+
if(ctx->nds > ctx->ndd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim", ";\n");}
383387

384388
strb_appends(&ctx->s, "\t\n");
385389
strb_appends(&ctx->s, "\t\n");
@@ -605,7 +609,7 @@ static void maxandargmaxAppendLoopMacroUndefs (maxandargmax_ctx* ctx){
605609
strb_appends(&ctx->s, "#undef DSTAINDEXER\n");
606610
}
607611
static void maxandargmaxComputeAxisList (maxandargmax_ctx* ctx){
608-
unsigned i, f=0;
612+
int i, f=0;
609613

610614
for(i=0;i<ctx->nds;i++){
611615
if(axisInSet(i, ctx->reduxList, ctx->ndr, 0)){
@@ -723,8 +727,10 @@ static int maxandargmaxSchedule (maxandargmax_ctx* ctx){
723727
}
724728
}
725729

726-
dims[bestWarpAxis] = (dims[bestWarpAxis] + warpSize - 1)/warpSize;
727-
gaIFactorize(warpSize, 0, 0, &factBS[bestWarpAxis]);
730+
if(ctx->ndh > 0){
731+
dims[bestWarpAxis] = (dims[bestWarpAxis] + warpSize - 1)/warpSize;
732+
gaIFactorize(warpSize, 0, 0, &factBS[bestWarpAxis]);
733+
}
728734

729735
/**
730736
* Factorization job. We'll steadily increase the slack in case of failure
@@ -804,7 +810,7 @@ static int maxandargmaxInvoke (maxandargmax_ctx* ctx){
804810
ctx->dstMaxStepsGD &&
805811
ctx->dstArgmaxStepsGD){
806812
ctx->ret = GpuKernel_call(&ctx->kernel,
807-
ctx->ndh,
813+
ctx->ndh>0 ? ctx->ndh : 1,
808814
ctx->blockSize,
809815
ctx->gridSize,
810816
0,

tests/check_reduction.c

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,93 @@ START_TEST(test_veryhighrank){
348348
GpuArray_clear(&gaArgmax);
349349
}END_TEST
350350

351+
START_TEST(test_alldimsreduced){
352+
pcgSeed(1);
353+
354+
/**
355+
* Test a full reduction: a random 3D tensor is reduced over all three of its axes.
356+
*/
357+
358+
size_t i,j,k;
359+
size_t dims[3] = {32,50,79};
360+
size_t prodDims = dims[0]*dims[1]*dims[2];
361+
const unsigned reduxList[] = {0,1,2};
362+
363+
float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]);
364+
float* pMax = calloc(1, sizeof(*pMax) );
365+
size_t* pArgmax = calloc(1, sizeof(*pArgmax) );
366+
367+
ck_assert_ptr_ne(pSrc, NULL);
368+
ck_assert_ptr_ne(pMax, NULL);
369+
ck_assert_ptr_ne(pArgmax, NULL);
370+
371+
372+
/**
373+
* Initialize source data.
374+
*/
375+
376+
for(i=0;i<prodDims;i++){
377+
pSrc[i] = pcgRand01();
378+
}
379+
380+
381+
/**
382+
* Run the kernel.
383+
*/
384+
385+
GpuArray gaSrc;
386+
GpuArray gaMax;
387+
GpuArray gaArgmax;
388+
389+
ga_assert_ok(GpuArray_empty(&gaSrc, ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER));
390+
ga_assert_ok(GpuArray_empty(&gaMax, ctx, GA_FLOAT, 0, NULL, GA_C_ORDER));
391+
ga_assert_ok(GpuArray_empty(&gaArgmax, ctx, GA_SIZE, 0, NULL, GA_C_ORDER));
392+
393+
ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims));
394+
ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */
395+
ga_assert_ok(GpuArray_memset(&gaArgmax, -1));
396+
397+
ga_assert_ok(GpuArray_maxandargmax(&gaMax, &gaArgmax, &gaSrc, 3, reduxList));
398+
399+
ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax), &gaMax));
400+
ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax), &gaArgmax));
401+
402+
403+
/**
404+
* Check that the destination tensors are correct.
405+
*/
406+
407+
size_t gtArgmax = 0;
408+
float gtMax = pSrc[0];
409+
410+
for(i=0;i<dims[0];i++){
411+
for(j=0;j<dims[1];j++){
412+
for(k=0;k<dims[2];k++){
413+
float v = pSrc[(i*dims[1] + j)*dims[2] + k];
414+
415+
if(v > gtMax){
416+
gtMax = v;
417+
gtArgmax = (i*dims[1] + j)*dims[2] + k;
418+
}
419+
}
420+
}
421+
}
422+
423+
ck_assert_msg(gtMax == pMax[0], "Max value mismatch!");
424+
ck_assert_msg(gtArgmax == pArgmax[0], "Argmax value mismatch!");
425+
426+
/**
427+
* Deallocate.
428+
*/
429+
430+
free(pSrc);
431+
free(pMax);
432+
free(pArgmax);
433+
GpuArray_clear(&gaSrc);
434+
GpuArray_clear(&gaMax);
435+
GpuArray_clear(&gaArgmax);
436+
}END_TEST
437+
351438
Suite *get_suite(void) {
352439
Suite *s = suite_create("reduction");
353440
TCase *tc = tcase_create("basic");
@@ -357,6 +444,7 @@ Suite *get_suite(void) {
357444
tcase_add_test(tc, test_reduction);
358445
tcase_add_test(tc, test_idxtranspose);
359446
tcase_add_test(tc, test_veryhighrank);
447+
tcase_add_test(tc, test_alldimsreduced);
360448

361449
suite_add_tcase(s, tc);
362450
return s;

0 commit comments

Comments
 (0)