You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
void HELPER_NAME(atile_block_load_rowmajor, MM, NN)(global ushort* A, int tM, int M, int K, int m, int k, short8 aData[KK][MM])
411
411
{
@@ -415,49 +415,52 @@ void HELPER_NAME(atile_block_load_rowmajor, MM, NN)(global ushort* A, int tM, in
415
415
//if (get_sub_group_local_id() == 0) {
416
416
// printf("atile block load : %d, %d, %2d: m = %3d, k = %3d, mm = %2d, kk = %2d, coord = %3d, %3d\n", (int)get_group_id(1), (int)get_group_id(0), get_sub_group_id(), m, k, mm, kk, k + kk * tK, m + mm * tM);
417
417
//}
418
-
ushort8tmp[2][4];
419
-
intel_sub_group_block_read_16b_32r16x2c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM), tmp);
418
+
short8aTemp[2][4];
419
+
intel_sub_group_2d_block_read_16b_32r16x2c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM), (ushort*)aTemp);
420
420
for (inttkk=0; tkk<2; tkk++) {
421
421
for (inttmm=0; tmm<4; tmm++) {
422
-
aData[kk+tkk][mm+tmm] =as_short8(tmp[tkk][tmm]);
422
+
aData[kk+tkk][mm+tmm] =aTemp[tkk][tmm];
423
423
}
424
424
}
425
425
}
426
426
}
427
427
} elseif (KK % 2==0&MM % 2==0) {
428
428
for (intkk=0; kk<KK; kk+=2) {
429
429
for (intmm=0; mm<MM; mm+=2) {
430
-
ushort8tmp[2][2];
431
-
intel_sub_group_block_read_16b_16r16x2c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM), tmp);
430
+
short8aTemp[2][2];
431
+
intel_sub_group_2d_block_read_16b_16r16x2c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM), (ushort*)aTemp);
432
432
for (inttkk=0; tkk<2; tkk++) {
433
433
for (inttmm=0; tmm<2; tmm++) {
434
-
aData[kk+tkk][mm+tmm] =as_short8(tmp[tkk][tmm]);
434
+
aData[kk+tkk][mm+tmm] =aTemp[tkk][tmm];
435
435
}
436
436
}
437
437
}
438
438
}
439
439
} elseif (KK % 2==0) {
440
440
for (intkk=0; kk<KK; kk+=2) {
441
441
for (intmm=0; mm<MM; mm++) {
442
-
short16aTemp=as_short16(intel_sub_group_block_read_16b_8r16x2c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM)));
443
-
aData[kk+0][mm] =aTemp.lo;
444
-
aData[kk+1][mm] =aTemp.hi;
442
+
short8aTemp[2];
443
+
intel_sub_group_2d_block_read_16b_8r16x2c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM), (ushort*)aTemp);
444
+
aData[kk+0][mm] =aTemp[0];
445
+
aData[kk+1][mm] =aTemp[1];
445
446
}
446
447
}
447
448
} elseif (MM % 4==0) {
448
449
for (intkk=0; kk<KK; kk++) {
449
450
for (intmm=0; mm<MM; mm+=4) {
450
-
ushort8tmp[4];
451
-
intel_sub_group_block_read_16b_32r16c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM), tmp);
451
+
short8aTemp[4];
452
+
intel_sub_group_2d_block_read_16b_32r16x1c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM), (ushort*)aTemp);
452
453
for (inttmm=0; tmm<4; tmm++) {
453
-
aData[kk][mm+tmm] =as_short8(tmp[tmm]);
454
+
aData[kk][mm+tmm] =aTemp[tmm];
454
455
}
455
456
}
456
457
}
457
458
} else {
458
459
for (intkk=0; kk<KK; kk++) {
459
460
for (intmm=0; mm<MM; mm++) {
460
-
aData[kk][mm] =as_short8(intel_sub_group_block_read_16b_8r16c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM)));
461
+
short8aTemp[1];
462
+
intel_sub_group_2d_block_read_16b_8r16x1c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM), (ushort*)aTemp);
463
+
aData[kk][mm] =aTemp[0];
461
464
}
462
465
}
463
466
}
@@ -471,35 +474,39 @@ void HELPER_NAME(btile_block_load_rowmajor, MM, NN)(global ushort* B, int tN, in
471
474
//if (get_sub_group_local_id() == 0) {
472
475
// printf("btile block load: %d, %d, %2d: n = %3d, k = %3d, nn = %2d, kk = %2d, coord = %3d, %3d\n", (int)get_group_id(1), (int)get_group_id(0), get_sub_group_id(), n, k, nn, kk, n + nn * tN, k + kk * tK);
@@ -533,39 +543,35 @@ void HELPER_NAME(atile_block_prefetch_rowmajor, MM, NN)(global ushort* A, int tM
533
543
//if (get_sub_group_local_id() == 0) {
534
544
// printf("atile block prefetch: %d, %d, %2d: sg_x = %d, m = %3d, k = %3d, mm = %2d, kk = %2d, coord = %3d, %3d\n", (int)get_group_id(1), (int)get_group_id(0), get_sub_group_id(), sg_index_x, m, k, mm, kk, k + kk * tK, m + mm * tM);
535
545
//}
536
-
#ifdefUSE_32C
537
-
intel_sub_group_block_prefetch_16b_8r32c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM));
538
-
#else
539
-
intel_sub_group_block_prefetch_16b_8r16x2c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM));
540
-
#endif
546
+
intel_sub_group_2d_block_prefetch_16b_8r16x2c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM));
541
547
} elseif (KK % 2==0&MM % 4==0) {
542
548
for (intkk=0; kk<KK; kk+=2) {
543
549
for (intmm=0; mm<MM; mm+=4) {
544
-
intel_sub_group_block_prefetch_16b_32r16x2c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM));
550
+
intel_sub_group_2d_block_prefetch_16b_32r16x2c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM));
545
551
}
546
552
}
547
553
} elseif (KK % 2==0&MM % 2==0) {
548
554
for (intkk=0; kk<KK; kk+=2) {
549
555
for (intmm=0; mm<MM; mm+=2) {
550
-
intel_sub_group_block_prefetch_16b_16r16x2c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM));
556
+
intel_sub_group_2d_block_prefetch_16b_16r16x2c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM));
551
557
}
552
558
}
553
559
} elseif (KK % 2==0) {
554
560
for (intkk=0; kk<KK; kk+=2) {
555
561
for (intmm=0; mm<MM; mm++) {
556
-
intel_sub_group_block_prefetch_16b_8r16x2c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM));
562
+
intel_sub_group_2d_block_prefetch_16b_8r16x2c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM));
557
563
}
558
564
}
559
565
} elseif (MM % 4==0) {
560
566
for (intkk=0; kk<KK; kk++) {
561
567
for (intmm=0; mm<MM; mm+=4) {
562
-
intel_sub_group_block_prefetch_16b_32r16c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM));
568
+
intel_sub_group_2d_block_prefetch_16b_32r16x1c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM));
563
569
}
564
570
}
565
571
} else {
566
572
for (intkk=0; kk<KK; kk++) {
567
573
for (intmm=0; mm<MM; mm++) {
568
-
intel_sub_group_block_prefetch_16b_8r16c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM));
574
+
intel_sub_group_2d_block_prefetch_16b_8r16x1c(A, K*sizeof(ushort), M, K*sizeof(ushort), (int2)(k+kk*tK, m+mm*tM));
569
575
}
570
576
}
571
577
}
@@ -580,33 +586,29 @@ void HELPER_NAME(btile_block_prefetch_rowmajor, MM, NN)(global ushort* B, int tN
580
586
//if (get_sub_group_local_id() == 0) {
581
587
// printf("btile block prefetch: %d, %d, %2d: sg_y = %d, n = %3d, k = %3d, nn = %2d, kk = %2d, coord = %3d, %3d\n", (int)get_group_id(1), (int)get_group_id(0), get_sub_group_id(), sg_index_y, n, k, nn, kk, n + nn * tN, k + kk * tK);
0 commit comments