@@ -336,7 +336,7 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str,
336336 int flags = GA_USE_CLUDA ;
337337 int res ;
338338
339- nargs = 7 + 2 * v -> nd ;
339+ nargs = 9 + 2 * v -> nd ;
340340
341341 atypes = calloc (nargs , sizeof (int ));
342342 if (atypes == NULL )
@@ -351,34 +351,41 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str,
351351 }
352352
353353 apos = 0 ;
354- strb_appendf (& sb , "KERNEL void take1(GLOBAL_MEM %s *r, "
355- "GLOBAL_MEM const %s *v, ga_size off ," ,
354+ strb_appendf (& sb , "KERNEL void take1(GLOBAL_MEM %s *r, ga_size r_off, "
355+ "GLOBAL_MEM const %s *v, ga_size v_off ," ,
356356 gpuarray_get_type (a -> typecode )-> cluda_name ,
357357 gpuarray_get_type (v -> typecode )-> cluda_name );
358358 atypes [apos ++ ] = GA_BUFFER ;
359+ atypes [apos ++ ] = GA_SIZE ;
359360 atypes [apos ++ ] = GA_BUFFER ;
360361 atypes [apos ++ ] = GA_SIZE ;
361362 for (i = 0 ; i < v -> nd ; i ++ ) {
362363 strb_appendf (& sb , " ga_ssize s%u, ga_size d%u," , i , i );
363364 atypes [apos ++ ] = GA_SSIZE ;
364365 atypes [apos ++ ] = GA_SIZE ;
365366 }
366- strb_appendf (& sb , " GLOBAL_MEM const %s *ind, ga_size n0, ga_size n1, "
367- " GLOBAL_MEM int* err) {\n" ,
367+ strb_appendf (& sb , " GLOBAL_MEM const %s *ind, ga_size i_off, "
368+ "ga_size n0, ga_size n1, GLOBAL_MEM int* err) {\n" ,
368369 gpuarray_get_type (ind -> typecode )-> cluda_name );
369370 atypes [apos ++ ] = GA_BUFFER ;
370371 atypes [apos ++ ] = GA_SIZE ;
371372 atypes [apos ++ ] = GA_SIZE ;
373+ atypes [apos ++ ] = GA_SIZE ;
372374 atypes [apos ++ ] = GA_BUFFER ;
373375 assert (apos == nargs );
374376 strb_appendf (& sb , " const %s idx0 = LDIM_0 * GID_0 + LID_0;\n"
375377 " const %s numThreads0 = LDIM_0 * GDIM_0;\n"
376378 " const %s idx1 = LDIM_1 * GID_1 + LID_1;\n"
377379 " const %s numThreads1 = LDIM_1 * GDIM_1;\n"
378380 " %s i0, i1;\n" , sz , sz , sz , sz , sz );
381+ strb_appends (& sb , " if (idx0 >= n0 || idx1 >= n1) return;\n" );
382+ strb_appendf (& sb , " r = (GLOBAL_MEM %s *)(((char *)r) + r_off);\n"
383+ " ind = (GLOBAL_MEM %s *)(((char *)ind) + i_off);\n" ,
384+ gpuarray_get_type (a -> typecode )-> cluda_name ,
385+ gpuarray_get_type (ind -> typecode )-> cluda_name );
379386 strb_appendf (& sb , " for (i0 = idx0; i0 < n0; i0 += numThreads0) {\n"
380387 " %s ii0 = ind[i0];\n"
381- " %s pos0 = off ;\n"
388+ " %s pos0 = v_off ;\n"
382389 " if (ii0 < 0) ii0 += d0;\n"
383390 " if ((ii0 < 0) || (ii0 >= d0)) {\n"
384391 " *err = -1;\n"
@@ -500,13 +507,16 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
500507
501508 argp = 0 ;
502509 GpuKernel_setarg (& k , argp ++ , a -> data );
510+ GpuKernel_setarg (& k , argp ++ , (void * )& a -> offset );
503511 GpuKernel_setarg (& k , argp ++ , v -> data );
512+ /* The cast is to avoid a warning about const */
504513 GpuKernel_setarg (& k , argp ++ , (void * )& v -> offset );
505514 for (j = 0 ; j < v -> nd ; j ++ ) {
506515 GpuKernel_setarg (& k , argp ++ , & v -> strides [j ]);
507516 GpuKernel_setarg (& k , argp ++ , & v -> dimensions [j ]);
508517 }
509518 GpuKernel_setarg (& k , argp ++ , i -> data );
519+ GpuKernel_setarg (& k , argp ++ , (void * )& i -> offset );
510520 GpuKernel_setarg (& k , argp ++ , & n [0 ]);
511521 GpuKernel_setarg (& k , argp ++ , & n [1 ]);
512522 GpuKernel_setarg (& k , argp ++ , errbuf );
@@ -1083,11 +1093,15 @@ int GpuArray_fdump(FILE *fd, const GpuArray *a) {
10831093 case GA_UINT :
10841094 fprintf (fd , "%u" , * (unsigned int * )p );
10851095 break ;
1096+ case GA_LONG :
1097+ fprintf (fd , "%lld" , (long long )* (int64_t * )p );
1098+ break ;
10861099 case GA_SSIZE :
10871100 fprintf (fd , "%" SPREFIX "d" , * (ssize_t * )p );
10881101 break ;
10891102 default :
10901103 free (buf );
1104+ fprintf (fd , "<unsupported data type %d>\n" , a -> typecode );
10911105 return GA_UNSUPPORTED_ERROR ;
10921106 }
10931107 s -= gpuarray_get_elsize (a -> typecode );
0 commit comments