@@ -380,7 +380,7 @@ kernel void i8_dpas_vnni_m8_n16(global int* C, global char* A, global char* B, i
380380 store_c_rowmajor_int32_m8_nx (C , sum , m , n , N );
381381}
382382
383- #ifdef cl_intel_subgroup_extended_block_read
383+ #ifdef cl_intel_subgroup_2d_block_io
384384
385385__attribute__((intel_reqd_sub_group_size (16 ))) __attribute__((reqd_work_group_size (16 , 1 , 1 )))
386386kernel void i8_dpas_blockread_rowmajor_m1_n16 (global int * C , global char * A , global char * B , int K )
@@ -395,13 +395,15 @@ kernel void i8_dpas_blockread_rowmajor_m1_n16(global int* C, global char* A, glo
395395
396396 int sum = 0 ;
397397 for (int k = 0 ; k < K ; k += tK ) {
398- short aData = as_short (intel_subgroup_block_read_u8_m1k32 (A , K * sizeof (char ), M , K * sizeof (char ), (int2 )(k , m )));
399- int8 bData = as_int8 (intel_subgroup_block_read_transform_u8_k32 (B , N * sizeof (char ), K , N * sizeof (char ), (int2 )(n , k )));
398+ short aData ;
399+ intel_sub_group_2d_block_read_8b_1r32x1c (A , K * sizeof (char ), M , K * sizeof (char ), (int2 )(k , m ), (ushort * )& aData );
400+ int8 bData ;
401+ intel_sub_group_2d_block_read_transform_8b_32r16x1c (B , N * sizeof (char ), K , N * sizeof (char ), (int2 )(n , k ), (uint * )& bData );
400402 sum = mat_mul_sg16 (aData , bData , sum );
401403 }
402404
403405 sum = activation (sum );
404- intel_subgroup_block_write_u32_m1k16 (C , N * sizeof (float ), M , N * sizeof (float ), (int2 )(n , m ), as_uint ( sum ) );
406+ intel_sub_group_2d_block_write_32b_1r16x1c (C , N * sizeof (float ), M , N * sizeof (float ), (int2 )(n , m ), ( uint * ) & sum );
405407}
406408
407409__attribute__((intel_reqd_sub_group_size (16 ))) __attribute__((reqd_work_group_size (16 , 1 , 1 )))
@@ -417,13 +419,15 @@ kernel void i8_dpas_blockread_rowmajor_m2_n16(global int* C, global char* A, glo
417419
418420 int2 sum = 0 ;
419421 for (int k = 0 ; k < K ; k += tK ) {
420- short2 aData = as_short2 (intel_subgroup_block_read_u8_m2k32 (A , K * sizeof (char ), M , K * sizeof (char ), (int2 )(k , m )));
421- int8 bData = as_int8 (intel_subgroup_block_read_transform_u8_k32 (B , N * sizeof (char ), K , N * sizeof (char ), (int2 )(n , k )));
422+ short2 aData ;
423+ intel_sub_group_2d_block_read_8b_2r32x1c (A , K * sizeof (char ), M , K * sizeof (char ), (int2 )(k , m ), (ushort * )& aData );
424+ int8 bData ;
425+ intel_sub_group_2d_block_read_transform_8b_32r16x1c (B , N * sizeof (char ), K , N * sizeof (char ), (int2 )(n , k ), (uint * )& bData );
422426 sum = mat_mul_sg16 (aData , bData , sum );
423427 }
424428
425429 sum = activation (sum );
426- intel_subgroup_block_write_u32_m2k16 (C , N * sizeof (float ), M , N * sizeof (float ), (int2 )(n , m ), as_uint2 ( sum ) );
430+ intel_sub_group_2d_block_write_32b_2r16x1c (C , N * sizeof (float ), M , N * sizeof (float ), (int2 )(n , m ), ( uint * ) & sum );
427431}
428432
429433__attribute__((intel_reqd_sub_group_size (16 ))) __attribute__((reqd_work_group_size (16 , 1 , 1 )))
@@ -439,13 +443,15 @@ kernel void i8_dpas_blockread_rowmajor_m4_n16(global int* C, global char* A, glo
439443
440444 int4 sum = 0 ;
441445 for (int k = 0 ; k < K ; k += tK ) {
442- short4 aData = as_short4 (intel_subgroup_block_read_u8_m4k32 (A , K * sizeof (char ), M , K * sizeof (char ), (int2 )(k , m )));
443- int8 bData = as_int8 (intel_subgroup_block_read_transform_u8_k32 (B , N * sizeof (char ), K , N * sizeof (char ), (int2 )(n , k )));
446+ short4 aData ;
447+ intel_sub_group_2d_block_read_8b_4r32x1c (A , K * sizeof (char ), M , K * sizeof (char ), (int2 )(k , m ), (ushort * )& aData );
448+ int8 bData ;
449+ intel_sub_group_2d_block_read_transform_8b_32r16x1c (B , N * sizeof (char ), K , N * sizeof (char ), (int2 )(n , k ), (uint * )& bData );
444450 sum = mat_mul_sg16 (aData , bData , sum );
445451 }
446452
447453 sum = activation (sum );
448- intel_subgroup_block_write_u32_m4k16 (C , N * sizeof (float ), M , N * sizeof (float ), (int2 )(n , m ), as_uint4 ( sum ) );
454+ intel_sub_group_2d_block_write_32b_4r16x1c (C , N * sizeof (float ), M , N * sizeof (float ), (int2 )(n , m ), ( uint * ) & sum );
449455}
450456
451457__attribute__((intel_reqd_sub_group_size (16 ))) __attribute__((reqd_work_group_size (16 , 1 , 1 )))
@@ -461,13 +467,15 @@ kernel void i8_dpas_blockread_rowmajor_m8_n16(global int* C, global char* A, glo
461467
462468 int8 sum = 0 ;
463469 for (int k = 0 ; k < K ; k += tK ) {
464- short8 aData = as_short8 (intel_subgroup_block_read_u8_m8k32 (A , K * sizeof (char ), M , K * sizeof (char ), (int2 )(k , m )));
465- int8 bData = as_int8 (intel_subgroup_block_read_transform_u8_k32 (B , N * sizeof (char ), K , N * sizeof (char ), (int2 )(n , k )));
470+ short8 aData ;
471+ intel_sub_group_2d_block_read_8b_8r32x1c (A , K * sizeof (char ), M , K * sizeof (char ), (int2 )(k , m ), (ushort * )& aData );
472+ int8 bData ;
473+ intel_sub_group_2d_block_read_transform_8b_32r16x1c (B , N * sizeof (char ), K , N * sizeof (char ), (int2 )(n , k ), (uint * )& bData );
466474 sum = mat_mul_sg16 (aData , bData , sum );
467475 }
468476
469477 sum = activation (sum );
470- intel_subgroup_block_write_u32_m8k16 (C , N * sizeof (float ), M , N * sizeof (float ), (int2 )(n , m ), as_uint8 ( sum ) );
478+ intel_sub_group_2d_block_write_32b_8r16x1c (C , N * sizeof (float ), M , N * sizeof (float ), (int2 )(n , m ), ( uint * ) & sum );
471479}
472480
473481__attribute__((intel_reqd_sub_group_size (16 ))) __attribute__((reqd_work_group_size (16 , 1 , 1 )))
@@ -483,13 +491,15 @@ kernel void i8_dpas_blockread_vnni_m1_n16(global int* C, global char* A, global
483491
484492 int sum = 0 ;
485493 for (int k = 0 ; k < K ; k += tK ) {
486- short aData = as_short (intel_subgroup_block_read_u8_m1k32 (A , K * sizeof (char ), M , K * sizeof (char ), (int2 )(k , m )));
487- int8 bData = as_int8 (intel_subgroup_block_read_u32_m8k16 (B , N * sizeof (uint ), K , N * sizeof (uint ), (int2 )(n , k / 4 )));
494+ short aData ;
495+ intel_sub_group_2d_block_read_8b_1r32x1c (A , K * sizeof (char ), M , K * sizeof (char ), (int2 )(k , m ), (ushort * )& aData );
496+ int8 bData ;
497+ intel_sub_group_2d_block_read_32b_8r16x1c (B , N * sizeof (uint ), K , N * sizeof (uint ), (int2 )(n , k / 4 ), (uint * )& bData );
488498 sum = mat_mul_sg16 (aData , bData , sum );
489499 }
490500
491501 sum = activation (sum );
492- intel_subgroup_block_write_u32_m1k16 (C , N * sizeof (float ), M , N * sizeof (float ), (int2 )(n , m ), as_uint ( sum ) );
502+ intel_sub_group_2d_block_write_32b_1r16x1c (C , N * sizeof (float ), M , N * sizeof (float ), (int2 )(n , m ), ( uint * ) & sum );
493503}
494504
495505__attribute__((intel_reqd_sub_group_size (16 ))) __attribute__((reqd_work_group_size (16 , 1 , 1 )))
@@ -505,13 +515,15 @@ kernel void i8_dpas_blockread_vnni_m2_n16(global int* C, global char* A, global
505515
506516 int2 sum = 0 ;
507517 for (int k = 0 ; k < K ; k += tK ) {
508- short2 aData = as_short2 (intel_subgroup_block_read_u8_m2k32 (A , K * sizeof (char ), M , K * sizeof (char ), (int2 )(k , m )));
509- int8 bData = as_int8 (intel_subgroup_block_read_u32_m8k16 (B , N * sizeof (uint ), K , N * sizeof (uint ), (int2 )(n , k / 4 )));
518+ short2 aData ;
519+ intel_sub_group_2d_block_read_8b_2r32x1c (A , K * sizeof (char ), M , K * sizeof (char ), (int2 )(k , m ), (ushort * )& aData );
520+ int8 bData ;
521+ intel_sub_group_2d_block_read_32b_8r16x1c (B , N * sizeof (uint ), K , N * sizeof (uint ), (int2 )(n , k / 4 ), (uint * )& bData );
510522 sum = mat_mul_sg16 (aData , bData , sum );
511523 }
512524
513525 sum = activation (sum );
514- intel_subgroup_block_write_u32_m2k16 (C , N * sizeof (float ), M , N * sizeof (float ), (int2 )(n , m ), as_uint2 ( sum ) );
526+ intel_sub_group_2d_block_write_32b_2r16x1c (C , N * sizeof (float ), M , N * sizeof (float ), (int2 )(n , m ), ( uint * ) & sum );
515527}
516528
517529__attribute__((intel_reqd_sub_group_size (16 ))) __attribute__((reqd_work_group_size (16 , 1 , 1 )))
@@ -527,13 +539,15 @@ kernel void i8_dpas_blockread_vnni_m4_n16(global int* C, global char* A, global
527539
528540 int4 sum = 0 ;
529541 for (int k = 0 ; k < K ; k += tK ) {
530- short4 aData = as_short4 (intel_subgroup_block_read_u8_m4k32 (A , K * sizeof (char ), M , K * sizeof (char ), (int2 )(k , m )));
531- int8 bData = as_int8 (intel_subgroup_block_read_u32_m8k16 (B , N * sizeof (uint ), K , N * sizeof (uint ), (int2 )(n , k / 4 )));
542+ short4 aData ;
543+ intel_sub_group_2d_block_read_8b_4r32x1c (A , K * sizeof (char ), M , K * sizeof (char ), (int2 )(k , m ), (ushort * )& aData );
544+ int8 bData ;
545+ intel_sub_group_2d_block_read_32b_8r16x1c (B , N * sizeof (uint ), K , N * sizeof (uint ), (int2 )(n , k / 4 ), (uint * )& bData );
532546 sum = mat_mul_sg16 (aData , bData , sum );
533547 }
534548
535549 sum = activation (sum );
536- intel_subgroup_block_write_u32_m4k16 (C , N * sizeof (float ), M , N * sizeof (float ), (int2 )(n , m ), as_uint4 ( sum ) );
550+ intel_sub_group_2d_block_write_32b_4r16x1c (C , N * sizeof (float ), M , N * sizeof (float ), (int2 )(n , m ), ( uint * ) & sum );
537551}
538552
539553__attribute__((intel_reqd_sub_group_size (16 ))) __attribute__((reqd_work_group_size (16 , 1 , 1 )))
@@ -549,16 +563,18 @@ kernel void i8_dpas_blockread_vnni_m8_n16(global int* C, global char* A, global
549563
550564 int8 sum = 0 ;
551565 for (int k = 0 ; k < K ; k += tK ) {
552- short8 aData = as_short8 (intel_subgroup_block_read_u8_m8k32 (A , K * sizeof (char ), M , K * sizeof (char ), (int2 )(k , m )));
553- int8 bData = as_int8 (intel_subgroup_block_read_u32_m8k16 (B , N * sizeof (uint ), K , N * sizeof (uint ), (int2 )(n , k / 4 )));
566+ short8 aData ;
567+ intel_sub_group_2d_block_read_8b_8r32x1c (A , K * sizeof (char ), M , K * sizeof (char ), (int2 )(k , m ), (ushort * )& aData );
568+ int8 bData ;
569+ intel_sub_group_2d_block_read_32b_8r16x1c (B , N * sizeof (uint ), K , N * sizeof (uint ), (int2 )(n , k / 4 ), (uint * )& bData );
554570 sum = mat_mul_sg16 (aData , bData , sum );
555571 }
556572
557573 sum = activation (sum );
558- intel_subgroup_block_write_u32_m8k16 (C , N * sizeof (float ), M , N * sizeof (float ), (int2 )(n , m ), as_uint8 ( sum ) );
574+ intel_sub_group_2d_block_write_32b_8r16x1c (C , N * sizeof (float ), M , N * sizeof (float ), (int2 )(n , m ), ( uint * ) & sum );
559575}
560576
561- #endif // cl_intel_subgroup_extended_block_read
577+ #endif // cl_intel_subgroup_2d_block_io
562578
563579#if 0 // disable the tiled cases for now
564580
0 commit comments