Skip to content

Commit 41159a8

Browse files
committed
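Switch the i8 block read/write kernels to the production built-in names (intel_sub_group_2d_block_*, guarded by cl_intel_subgroup_2d_block_io).
The tiled kernels are still disabled (#if 0) and still need to be enabled and ported.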
1 parent dddfdf3 commit 41159a8

2 files changed

Lines changed: 50 additions & 26 deletions

File tree

samples/99_matrixexperimentsi8/main.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,8 @@ static void i8_dpas_blockread_rowmajor(
443443
cl::Kernel kernel{program, kernelName.c_str()};
444444
if (kernel() == nullptr) {
445445
printf("unsupported.\n");
446+
} else if (K < 64 || N < 64) {
447+
printf("matrix pitch for block reads must be >= 64 bytes.\n");
446448
} else {
447449
kernel.setArg(0, C);
448450
kernel.setArg(1, A);
@@ -502,6 +504,8 @@ static void i8_dpas_blockread_rowmajor_tiled(
502504
printf("M is too small.\n");
503505
} else if (tN * NN > N) {
504506
printf("N is too small.\n");
507+
} else if (K < 64 || N < 64) {
508+
printf("matrix pitch for block reads must be >= 64 bytes.\n");
505509
} else {
506510
kernel.setArg(0, C);
507511
kernel.setArg(1, A);
@@ -555,6 +559,8 @@ static void i8_dpas_blockread_vnni(
555559
cl::Kernel kernel{program, kernelName.c_str()};
556560
if (kernel() == nullptr) {
557561
printf("unsupported.\n");
562+
} else if (K < 64 || N < 64/4) {
563+
printf("matrix pitch for block reads must be >= 64 bytes.\n");
558564
} else {
559565
kernel.setArg(0, C);
560566
kernel.setArg(1, A);
@@ -614,6 +620,8 @@ static void i8_dpas_blockread_vnni_tiled(
614620
printf("M is too small.\n");
615621
} else if (tN * NN > N) {
616622
printf("N is too small.\n");
623+
} else if (K < 64 || N < 64/4) {
624+
printf("matrix pitch for block reads must be >= 64 bytes.\n");
617625
} else {
618626
kernel.setArg(0, C);
619627
kernel.setArg(1, A);

samples/99_matrixexperimentsi8/matrix_kernels_i8.cl

Lines changed: 42 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,7 @@ kernel void i8_dpas_vnni_m8_n16(global int* C, global char* A, global char* B, i
380380
store_c_rowmajor_int32_m8_nx(C, sum, m, n, N);
381381
}
382382

383-
#ifdef cl_intel_subgroup_extended_block_read
383+
#ifdef cl_intel_subgroup_2d_block_io
384384

385385
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
386386
kernel void i8_dpas_blockread_rowmajor_m1_n16(global int* C, global char* A, global char* B, int K)
@@ -395,13 +395,15 @@ kernel void i8_dpas_blockread_rowmajor_m1_n16(global int* C, global char* A, glo
395395

396396
int sum = 0;
397397
for (int k = 0; k < K; k += tK) {
398-
short aData = as_short(intel_subgroup_block_read_u8_m1k32(A, K * sizeof(char), M, K * sizeof(char), (int2)(k, m)));
399-
int8 bData = as_int8(intel_subgroup_block_read_transform_u8_k32(B, N * sizeof(char), K, N * sizeof(char), (int2)(n, k)));
398+
short aData;
399+
intel_sub_group_2d_block_read_8b_1r32x1c(A, K * sizeof(char), M, K * sizeof(char), (int2)(k, m), (ushort*)&aData);
400+
int8 bData;
401+
intel_sub_group_2d_block_read_transform_8b_32r16x1c(B, N * sizeof(char), K, N * sizeof(char), (int2)(n, k), (uint*)&bData);
400402
sum = mat_mul_sg16(aData, bData, sum);
401403
}
402404

403405
sum = activation(sum);
404-
intel_subgroup_block_write_u32_m1k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint(sum));
406+
intel_sub_group_2d_block_write_32b_1r16x1c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), (uint*)&sum);
405407
}
406408

407409
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
@@ -417,13 +419,15 @@ kernel void i8_dpas_blockread_rowmajor_m2_n16(global int* C, global char* A, glo
417419

418420
int2 sum = 0;
419421
for (int k = 0; k < K; k += tK) {
420-
short2 aData = as_short2(intel_subgroup_block_read_u8_m2k32(A, K * sizeof(char), M, K * sizeof(char), (int2)(k, m)));
421-
int8 bData = as_int8(intel_subgroup_block_read_transform_u8_k32(B, N * sizeof(char), K, N * sizeof(char), (int2)(n, k)));
422+
short2 aData;
423+
intel_sub_group_2d_block_read_8b_2r32x1c(A, K * sizeof(char), M, K * sizeof(char), (int2)(k, m), (ushort*)&aData);
424+
int8 bData;
425+
intel_sub_group_2d_block_read_transform_8b_32r16x1c(B, N * sizeof(char), K, N * sizeof(char), (int2)(n, k), (uint*)&bData);
422426
sum = mat_mul_sg16(aData, bData, sum);
423427
}
424428

425429
sum = activation(sum);
426-
intel_subgroup_block_write_u32_m2k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint2(sum));
430+
intel_sub_group_2d_block_write_32b_2r16x1c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), (uint*)&sum);
427431
}
428432

429433
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
@@ -439,13 +443,15 @@ kernel void i8_dpas_blockread_rowmajor_m4_n16(global int* C, global char* A, glo
439443

440444
int4 sum = 0;
441445
for (int k = 0; k < K; k += tK) {
442-
short4 aData = as_short4(intel_subgroup_block_read_u8_m4k32(A, K * sizeof(char), M, K * sizeof(char), (int2)(k, m)));
443-
int8 bData = as_int8(intel_subgroup_block_read_transform_u8_k32(B, N * sizeof(char), K, N * sizeof(char), (int2)(n, k)));
446+
short4 aData;
447+
intel_sub_group_2d_block_read_8b_4r32x1c(A, K * sizeof(char), M, K * sizeof(char), (int2)(k, m), (ushort*)&aData);
448+
int8 bData;
449+
intel_sub_group_2d_block_read_transform_8b_32r16x1c(B, N * sizeof(char), K, N * sizeof(char), (int2)(n, k), (uint*)&bData);
444450
sum = mat_mul_sg16(aData, bData, sum);
445451
}
446452

447453
sum = activation(sum);
448-
intel_subgroup_block_write_u32_m4k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint4(sum));
454+
intel_sub_group_2d_block_write_32b_4r16x1c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), (uint*)&sum);
449455
}
450456

451457
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
@@ -461,13 +467,15 @@ kernel void i8_dpas_blockread_rowmajor_m8_n16(global int* C, global char* A, glo
461467

462468
int8 sum = 0;
463469
for (int k = 0; k < K; k += tK) {
464-
short8 aData = as_short8(intel_subgroup_block_read_u8_m8k32(A, K * sizeof(char), M, K * sizeof(char), (int2)(k, m)));
465-
int8 bData = as_int8(intel_subgroup_block_read_transform_u8_k32(B, N * sizeof(char), K, N * sizeof(char), (int2)(n, k)));
470+
short8 aData;
471+
intel_sub_group_2d_block_read_8b_8r32x1c(A, K * sizeof(char), M, K * sizeof(char), (int2)(k, m), (ushort*)&aData);
472+
int8 bData;
473+
intel_sub_group_2d_block_read_transform_8b_32r16x1c(B, N * sizeof(char), K, N * sizeof(char), (int2)(n, k), (uint*)&bData);
466474
sum = mat_mul_sg16(aData, bData, sum);
467475
}
468476

469477
sum = activation(sum);
470-
intel_subgroup_block_write_u32_m8k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint8(sum));
478+
intel_sub_group_2d_block_write_32b_8r16x1c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), (uint*)&sum);
471479
}
472480

473481
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
@@ -483,13 +491,15 @@ kernel void i8_dpas_blockread_vnni_m1_n16(global int* C, global char* A, global
483491

484492
int sum = 0;
485493
for (int k = 0; k < K; k += tK) {
486-
short aData = as_short(intel_subgroup_block_read_u8_m1k32(A, K * sizeof(char), M, K * sizeof(char), (int2)(k, m)));
487-
int8 bData = as_int8(intel_subgroup_block_read_u32_m8k16(B, N * sizeof(uint), K, N * sizeof(uint), (int2)(n, k / 4)));
494+
short aData;
495+
intel_sub_group_2d_block_read_8b_1r32x1c(A, K * sizeof(char), M, K * sizeof(char), (int2)(k, m), (ushort*)&aData);
496+
int8 bData;
497+
intel_sub_group_2d_block_read_32b_8r16x1c(B, N * sizeof(uint), K, N * sizeof(uint), (int2)(n, k / 4), (uint*)&bData);
488498
sum = mat_mul_sg16(aData, bData, sum);
489499
}
490500

491501
sum = activation(sum);
492-
intel_subgroup_block_write_u32_m1k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint(sum));
502+
intel_sub_group_2d_block_write_32b_1r16x1c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), (uint*)&sum);
493503
}
494504

495505
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
@@ -505,13 +515,15 @@ kernel void i8_dpas_blockread_vnni_m2_n16(global int* C, global char* A, global
505515

506516
int2 sum = 0;
507517
for (int k = 0; k < K; k += tK) {
508-
short2 aData = as_short2(intel_subgroup_block_read_u8_m2k32(A, K * sizeof(char), M, K * sizeof(char), (int2)(k, m)));
509-
int8 bData = as_int8(intel_subgroup_block_read_u32_m8k16(B, N * sizeof(uint), K, N * sizeof(uint), (int2)(n, k / 4)));
518+
short2 aData;
519+
intel_sub_group_2d_block_read_8b_2r32x1c(A, K * sizeof(char), M, K * sizeof(char), (int2)(k, m), (ushort*)&aData);
520+
int8 bData;
521+
intel_sub_group_2d_block_read_32b_8r16x1c(B, N * sizeof(uint), K, N * sizeof(uint), (int2)(n, k / 4), (uint*)&bData);
510522
sum = mat_mul_sg16(aData, bData, sum);
511523
}
512524

513525
sum = activation(sum);
514-
intel_subgroup_block_write_u32_m2k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint2(sum));
526+
intel_sub_group_2d_block_write_32b_2r16x1c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), (uint*)&sum);
515527
}
516528

517529
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
@@ -527,13 +539,15 @@ kernel void i8_dpas_blockread_vnni_m4_n16(global int* C, global char* A, global
527539

528540
int4 sum = 0;
529541
for (int k = 0; k < K; k += tK) {
530-
short4 aData = as_short4(intel_subgroup_block_read_u8_m4k32(A, K * sizeof(char), M, K * sizeof(char), (int2)(k, m)));
531-
int8 bData = as_int8(intel_subgroup_block_read_u32_m8k16(B, N * sizeof(uint), K, N * sizeof(uint), (int2)(n, k / 4)));
542+
short4 aData;
543+
intel_sub_group_2d_block_read_8b_4r32x1c(A, K * sizeof(char), M, K * sizeof(char), (int2)(k, m), (ushort*)&aData);
544+
int8 bData;
545+
intel_sub_group_2d_block_read_32b_8r16x1c(B, N * sizeof(uint), K, N * sizeof(uint), (int2)(n, k / 4), (uint*)&bData);
532546
sum = mat_mul_sg16(aData, bData, sum);
533547
}
534548

535549
sum = activation(sum);
536-
intel_subgroup_block_write_u32_m4k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint4(sum));
550+
intel_sub_group_2d_block_write_32b_4r16x1c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), (uint*)&sum);
537551
}
538552

539553
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
@@ -549,16 +563,18 @@ kernel void i8_dpas_blockread_vnni_m8_n16(global int* C, global char* A, global
549563

550564
int8 sum = 0;
551565
for (int k = 0; k < K; k += tK) {
552-
short8 aData = as_short8(intel_subgroup_block_read_u8_m8k32(A, K * sizeof(char), M, K * sizeof(char), (int2)(k, m)));
553-
int8 bData = as_int8(intel_subgroup_block_read_u32_m8k16(B, N * sizeof(uint), K, N * sizeof(uint), (int2)(n, k / 4)));
566+
short8 aData;
567+
intel_sub_group_2d_block_read_8b_8r32x1c(A, K * sizeof(char), M, K * sizeof(char), (int2)(k, m), (ushort*)&aData);
568+
int8 bData;
569+
intel_sub_group_2d_block_read_32b_8r16x1c(B, N * sizeof(uint), K, N * sizeof(uint), (int2)(n, k / 4), (uint*)&bData);
554570
sum = mat_mul_sg16(aData, bData, sum);
555571
}
556572

557573
sum = activation(sum);
558-
intel_subgroup_block_write_u32_m8k16(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), as_uint8(sum));
574+
intel_sub_group_2d_block_write_32b_8r16x1c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), (uint*)&sum);
559575
}
560576

561-
#endif // cl_intel_subgroup_extended_block_read
577+
#endif // cl_intel_subgroup_2d_block_io
562578

563579
#if 0 // disable the tiled cases for now
564580

0 commit comments

Comments
 (0)