@@ -630,7 +630,6 @@ void store_c_rowmajor_int32_m8_nx(global int* C, int8 v, int rowStart, int colSt
630630
631631#endif // defined(cl_intel_subgroups) && defined(cl_intel_subgroups_short)
632632
633- #if 0
634633#ifdef cl_intel_subgroup_extended_block_read
635634
636635// Note for 2D block reads:
@@ -689,205 +688,42 @@ enum LSC_LDCC {
689688 LSC_LDCC_L1IAR_L3C = 7 , // Override to L1 invalidate-after-read, and L3 cached
690689};
691690
692- typedef ushort __attribute__ ((ext_vector_type (32 ))) ushort32 ;
693- typedef ushort __attribute__ ((ext_vector_type (64 ))) ushort64 ;
694-
695- typedef uint __attribute__ ((ext_vector_type (32 ))) uint32 ;
696-
697691// Define block reads, prefetches, and writes. These are supported by the hardware but are not in the headers:
698692
699- ushort __builtin_IB_subgroup_block_read_flat_u16_m1k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
700- ushort2 __builtin_IB_subgroup_block_read_flat_u16_m2k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
701- ushort4 __builtin_IB_subgroup_block_read_flat_u16_m4k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
702- ushort8 __builtin_IB_subgroup_block_read_flat_u16_m8k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
703- ushort16 __builtin_IB_subgroup_block_read_flat_u16_m16k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
704- ushort32 __builtin_IB_subgroup_block_read_flat_u16_m32k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
705-
706- ushort32 __builtin_IB_subgroup_block_read_flat_u16_m16k16v2 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
707- ushort64 __builtin_IB_subgroup_block_read_flat_u16_m32k16v2 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
693+ ushort __builtin_IB_subgroup_block_read_flat_u8_m1k32v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
694+ ushort2 __builtin_IB_subgroup_block_read_flat_u8_m2k32v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
695+ ushort4 __builtin_IB_subgroup_block_read_flat_u8_m4k32v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
696+ ushort8 __builtin_IB_subgroup_block_read_flat_u8_m8k32v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
708697
709698uint8 __builtin_IB_subgroup_block_read_flat_u32_m8k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
710- uint16 __builtin_IB_subgroup_block_read_flat_u32_m16k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
711-
712- uint16 __builtin_IB_subgroup_block_read_flat_transform_u16_k32 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
713-
714- uint16 __builtin_IB_subgroup_block_read_flat_transform_u16_k16v2 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
715- uint32 __builtin_IB_subgroup_block_read_flat_transform_u16_k32v2 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord );
716699
717-
718- void __builtin_IB_subgroup_block_read_prefetch_u16_m1k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , enum LSC_LDCC cache_control );
719- void __builtin_IB_subgroup_block_read_prefetch_u16_m2k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , enum LSC_LDCC cache_control );
720- void __builtin_IB_subgroup_block_read_prefetch_u16_m4k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , enum LSC_LDCC cache_control );
721- void __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , enum LSC_LDCC cache_control );
722- void __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , enum LSC_LDCC cache_control );
723- void __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , enum LSC_LDCC cache_control );
724- void __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , enum LSC_LDCC cache_control );
725-
726- void __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , enum LSC_LDCC cache_control );
727- void __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , enum LSC_LDCC cache_control );
728-
729- void __builtin_IB_subgroup_block_read_prefetch_u32_m8k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , enum LSC_LDCC cache_control );
730- void __builtin_IB_subgroup_block_read_prefetch_u32_m16k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , enum LSC_LDCC cache_control );
731-
732-
733- void __builtin_IB_subgroup_block_write_flat_u32_m1k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , uint data );
734- void __builtin_IB_subgroup_block_write_flat_u32_m2k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , uint2 data );
735- void __builtin_IB_subgroup_block_write_flat_u32_m4k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , uint4 data );
736- void __builtin_IB_subgroup_block_write_flat_u32_m8k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , uint8 data );
737- void __builtin_IB_subgroup_block_write_flat_u32_m16k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , uint16 data );
738-
739- ushort intel_subgroup_block_read_u16_m1k16 (const __global void * base_address , int width , int height , int pitch , int2 coord )
740- {
741- return __builtin_IB_subgroup_block_read_flat_u16_m1k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord );
742- }
743- ushort2 intel_subgroup_block_read_u16_m2k16 (const __global void * base_address , int width , int height , int pitch , int2 coord )
700+ ushort intel_subgroup_block_read_u8_m1k32 (const __global void * base_address , int width , int height , int pitch , int2 coord )
744701{
745- return __builtin_IB_subgroup_block_read_flat_u16_m2k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord );
702+ return __builtin_IB_subgroup_block_read_flat_u8_m1k32v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord );
746703}
747- ushort4 intel_subgroup_block_read_u16_m4k16 (const __global void * base_address , int width , int height , int pitch , int2 coord )
704+ ushort2 intel_subgroup_block_read_u8_m2k32 (const __global void * base_address , int width , int height , int pitch , int2 coord )
748705{
749- return __builtin_IB_subgroup_block_read_flat_u16_m4k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord );
706+ return __builtin_IB_subgroup_block_read_flat_u8_m2k32v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord );
750707}
751- ushort8 intel_subgroup_block_read_u16_m8k16 (const __global void * base_address , int width , int height , int pitch , int2 coord )
708+ ushort4 intel_subgroup_block_read_u8_m4k32 (const __global void * base_address , int width , int height , int pitch , int2 coord )
752709{
753- return __builtin_IB_subgroup_block_read_flat_u16_m8k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord );
710+ return __builtin_IB_subgroup_block_read_flat_u8_m4k32v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord );
754711}
755- ushort16 intel_subgroup_block_read_u16_m16k16 (const __global void * base_address , int width , int height , int pitch , int2 coord )
712+ ushort8 intel_subgroup_block_read_u8_m8k32 (const __global void * base_address , int width , int height , int pitch , int2 coord )
756713{
757- return __builtin_IB_subgroup_block_read_flat_u16_m16k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord );
758- }
759- void intel_subgroup_block_read_u16_m32k16 (const __global void * base_address , int width , int height , int pitch , int2 coord , ushort8 dst [4 ])
760- {
761- ushort32 tmp = __builtin_IB_subgroup_block_read_flat_u16_m32k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord );
762- dst [0 ] = tmp .lo .lo ;
763- dst [1 ] = tmp .lo .hi ;
764- dst [2 ] = tmp .hi .lo ;
765- dst [3 ] = tmp .hi .hi ;
714+ return __builtin_IB_subgroup_block_read_flat_u8_m8k32v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord );
766715}
767716
768- void intel_subgroup_block_read_u16_m16k16v2 (const __global void * base_address , int width , int height , int pitch , int2 coord , ushort8 dst [2 ][2 ])
769- {
770- ushort32 tmp = __builtin_IB_subgroup_block_read_flat_u16_m16k16v2 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord );
771- dst [0 ][0 ] = tmp .lo .lo ;
772- dst [0 ][1 ] = tmp .lo .hi ;
773- dst [1 ][0 ] = tmp .hi .lo ;
774- dst [1 ][1 ] = tmp .hi .hi ;
775- }
776- void intel_subgroup_block_read_u16_m32k16v2 (const __global void * base_address , int width , int height , int pitch , int2 coord , ushort8 dst [2 ][4 ])
777- {
778- ushort64 tmp = __builtin_IB_subgroup_block_read_flat_u16_m32k16v2 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord );
779- dst [0 ][0 ] = tmp .lo .lo .lo ;
780- dst [0 ][1 ] = tmp .lo .lo .hi ;
781- dst [0 ][2 ] = tmp .lo .hi .lo ;
782- dst [0 ][3 ] = tmp .lo .hi .hi ;
783- dst [1 ][0 ] = tmp .hi .lo .lo ;
784- dst [1 ][1 ] = tmp .hi .lo .hi ;
785- dst [1 ][2 ] = tmp .hi .hi .lo ;
786- dst [1 ][3 ] = tmp .hi .hi .hi ;
787- }
788-
789- uint8 intel_subgroup_block_read_u32_m8k16 (const __global void * base_address , int width , int height , int pitch , int2 coord )
717+ uint8 intel_subgroup_block_read_u32_m8k16 (const __global void * base_address , int width , int height , int pitch , int2 coord )
790718{
791719 return __builtin_IB_subgroup_block_read_flat_u32_m8k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord );
792720}
793- uint16 intel_subgroup_block_read_u32_m16k16 (const __global void * base_address , int width , int height , int pitch , int2 coord )
794- {
795- return __builtin_IB_subgroup_block_read_flat_u32_m16k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord );
796- }
797-
798- // Each block is K rows x N columns, where the K rows have been VNNI transformed.
799- int8 intel_subgroup_block_read_transform_u16_k16n16 (__global void * base_address , int width , int height , int pitch , int2 coord )
800- {
801- // Note: this function is in the headers, but is named confusingly and returns unsigned integers rather than signed integers:
802- return as_int8 (intel_subgroup_block_read_transform_u16_k16 (base_address , width , height , pitch , coord ));
803- }
804- int16 intel_subgroup_block_read_transform_u16_k32n16 (__global void * base_address , int width , int height , int pitch , int2 coord )
805- {
806- return as_int16 (__builtin_IB_subgroup_block_read_flat_transform_u16_k32 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord ));
807- }
808- int16 intel_subgroup_block_read_transform_u16_k16n16v2 (__global void * base_address , int width , int height , int pitch , int2 coord )
809- {
810- return as_int16 (__builtin_IB_subgroup_block_read_flat_transform_u16_k16v2 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord ));
811- }
812- void intel_subgroup_block_read_transform_u16_k32n16v2 (__global void * base_address , int width , int height , int pitch , int2 coord , int8 dst [2 ][2 ])
813- {
814- uint32 tmp = __builtin_IB_subgroup_block_read_flat_transform_u16_k32v2 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord );
815- dst [0 ][0 ] = as_int8 (tmp .lo .lo );
816- dst [0 ][1 ] = as_int8 (tmp .lo .hi );
817- dst [1 ][0 ] = as_int8 (tmp .hi .lo );
818- dst [1 ][1 ] = as_int8 (tmp .hi .hi );
819- }
820-
821721
822- #define BLOCK_PREFETCH_CACHE_TYPE LSC_LDCC_L1C_L3C
823-
824- void intel_subgroup_block_prefetch_u16_m1k16 (const __global void * base_address , int width , int height , int pitch , int2 coord )
825- {
826- #if defined(PREFETCH_DEFAULT )
827- __builtin_IB_subgroup_block_read_prefetch_u16_m1k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord , BLOCK_PREFETCH_CACHE_TYPE );
828- #endif // defined(PREFETCH_DEFAULT)
829- }
830- void intel_subgroup_block_prefetch_u16_m2k16 (const __global void * base_address , int width , int height , int pitch , int2 coord )
831- {
832- #if defined(PREFETCH_DEFAULT )
833- __builtin_IB_subgroup_block_read_prefetch_u16_m2k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord , BLOCK_PREFETCH_CACHE_TYPE );
834- #endif // defined(PREFETCH_DEFAULT)
835- }
836- void intel_subgroup_block_prefetch_u16_m4k16 (const __global void * base_address , int width , int height , int pitch , int2 coord )
837- {
838- #if defined(PREFETCH_DEFAULT )
839- __builtin_IB_subgroup_block_read_prefetch_u16_m4k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord , BLOCK_PREFETCH_CACHE_TYPE );
840- #endif // defined(PREFETCH_DEFAULT)
841- }
842- void intel_subgroup_block_prefetch_u16_m8k16 (const __global void * base_address , int width , int height , int pitch , int2 coord )
843- {
844- #if defined(PREFETCH_DEFAULT )
845- __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord , BLOCK_PREFETCH_CACHE_TYPE );
846- #endif // defined(PREFETCH_DEFAULT)
847- }
848- void intel_subgroup_block_prefetch_u16_m8k16v2 (__global void * base_address , int width , int height , int pitch , int2 coord )
849- {
850- #if defined(PREFETCH_DEFAULT )
851- __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord , BLOCK_PREFETCH_CACHE_TYPE );
852- #endif // defined(PREFETCH_DEFAULT)
853- }
854- void intel_subgroup_block_prefetch_u16_m16k16 (const __global void * base_address , int width , int height , int pitch , int2 coord )
855- {
856- #if defined(PREFETCH_DEFAULT )
857- __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord , BLOCK_PREFETCH_CACHE_TYPE );
858- #endif // defined(PREFETCH_DEFAULT)
859- }
860- void intel_subgroup_block_prefetch_u16_m32k16 (const __global void * base_address , int width , int height , int pitch , int2 coord )
861- {
862- #if defined(PREFETCH_DEFAULT )
863- __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord , BLOCK_PREFETCH_CACHE_TYPE );
864- #endif // defined(PREFETCH_DEFAULT)
865- }
866- void intel_subgroup_block_prefetch_u16_m16k16v2 (const __global void * base_address , int width , int height , int pitch , int2 coord )
867- {
868- #if defined(PREFETCH_DEFAULT )
869- __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord , BLOCK_PREFETCH_CACHE_TYPE );
870- #endif // defined(PREFETCH_DEFAULT)
871- }
872- void intel_subgroup_block_prefetch_u16_m32k16v2 (const __global void * base_address , int width , int height , int pitch , int2 coord )
873- {
874- #if defined(PREFETCH_DEFAULT )
875- __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord , BLOCK_PREFETCH_CACHE_TYPE );
876- #endif // defined(PREFETCH_DEFAULT)
877- }
878- void intel_subgroup_block_prefetch_u32_m8k16 (const __global void * base_address , int width , int height , int pitch , int2 coord )
879- {
880- #if defined(PREFETCH_DEFAULT )
881- __builtin_IB_subgroup_block_read_prefetch_u32_m8k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord , BLOCK_PREFETCH_CACHE_TYPE );
882- #endif // defined(PREFETCH_DEFAULT)
883- }
884- void intel_subgroup_block_prefetch_u32_m16k16 (const __global void * base_address , int width , int height , int pitch , int2 coord )
885- {
886- #if defined(PREFETCH_DEFAULT )
887- __builtin_IB_subgroup_block_read_prefetch_u32_m16k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord , BLOCK_PREFETCH_CACHE_TYPE );
888- #endif // defined(PREFETCH_DEFAULT)
889- }
890722
723+ void __builtin_IB_subgroup_block_write_flat_u32_m1k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , uint data );
724+ void __builtin_IB_subgroup_block_write_flat_u32_m2k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , uint2 data );
725+ void __builtin_IB_subgroup_block_write_flat_u32_m4k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , uint4 data );
726+ void __builtin_IB_subgroup_block_write_flat_u32_m8k16v1 (long baseoffset , int width_minus_one , int height_minus_one , int pitch_minus_one , int2 coord , uint8 data );
891727
892728void intel_subgroup_block_write_u32_m1k16 (__global void * base_address , int width , int height , int pitch , int2 coord , uint data )
893729{
@@ -905,10 +741,5 @@ void intel_subgroup_block_write_u32_m8k16(__global void* base_address, int width
905741{
906742 __builtin_IB_subgroup_block_write_flat_u32_m8k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord , data );
907743}
908- void intel_subgroup_block_write_u32_m16k16 (__global void * base_address , int width , int height , int pitch , int2 coord , uint16 data )
909- {
910- __builtin_IB_subgroup_block_write_flat_u32_m16k16v1 (as_long (base_address ), width - 1 , height - 1 , pitch - 1 , coord , data );
911- }
912744
913745#endif // cl_intel_subgroup_extended_block_read
914- #endif
0 commit comments