@@ -284,125 +284,3 @@ void store_c_rowmajor_fp32_8rNc(global float* C, float8 v, int rowStart, int col
284284}
285285
286286#endif // defined(cl_intel_subgroups)
287-
288- #ifdef cl_intel_subgroup_2d_block_io
289-
// Note for 2D block reads:
//  - the tile width and height are encoded into the function name.
//  - base_address is the byte address.  Must be 64B aligned.
//  - width is the width of the entire matrix, in bytes.  Must be >= 64B.  Must be 4B aligned.
//  - height is the height of the entire matrix, or equivalently the number of rows.
//  - pitch is the number of bytes between rows of the entire matrix.  Must be >= 64B.  Must be a multiple of 8 bytes.
//  - coord is the number of elements (x coord) and row (y coord) to read from.  X coord must be a multiple of 4 for 1B data and 2 for 2B data.
297-
// For intrinsics, the pattern is:
//  - prefix: __builtin_IB_subgroup_block_read_flat or __builtin_IB_subgroup_block_write_flat
//  - operation (optional): _transpose or _transform
//  - for no transpose or transform:
//      - type / element size: _u8 or _u16 or _u32 or _u64
//      - number of tile rows: _m32 or _m16 or _m8 or _m4 or _m2 or _m1
//      - tile width: _k64 or _k32 or _k16 or _k8
//      - number of tiles: _v2 or _v1
//  - for transpose:
//      - type / element size: _u64 or _u32
//      - number of tile rows: subgroup size (16)
//      - tile width: _k4 (for _u64) or _k8 (for _u32)
//      - number of tiles: 1
//  - for transform:
//      - type / element size: _u16 or _u8
//      - number of tile rows: _k32 (for _u8) or _k16 (for _u16)
//      - tile width: subgroup size (16)
//      - number of tiles: 1
316-
// Load cache-control selectors (LSC_LDCC).
// Every value other than the default overrides both the L1 and the L3
// caching behavior, as spelled out next to each enumerator.
enum LSC_LDCC {
    LSC_LDCC_DEFAULT    = 0,    // Default setting
    LSC_LDCC_L1UC_L3UC  = 1,    // Override to L1 uncached and L3 uncached
    LSC_LDCC_L1UC_L3C   = 2,    // Override to L1 uncached and L3 cached
    LSC_LDCC_L1C_L3UC   = 3,    // Override to L1 cached and L3 uncached
    LSC_LDCC_L1C_L3C    = 4,    // Override to L1 cached and L3 cached
    LSC_LDCC_L1S_L3UC   = 5,    // Override to L1 streaming load and L3 uncached
    LSC_LDCC_L1S_L3C    = 6,    // Override to L1 streaming load and L3 cached
    LSC_LDCC_L1IAR_L3C  = 7,    // Override to L1 invalidate-after-read, and L3 cached
};
327-
// Define block reads, prefetches, and writes.  These are supported by the hardware but are not in the headers:
329-
// Prototypes for the flat 2D block read / write builtins used by the
// intel_sub_group_block_* wrappers in this file.
//
// All of them take the same leading arguments:
//   baseoffset        - base address passed as a long
//   width_minus_one   - width argument, already decremented by one
//   height_minus_one  - height argument, already decremented by one
//   pitch_minus_one   - pitch argument, already decremented by one
//   coord             - (x, y) coordinate pair
// The write builtins take the data to store as a trailing argument.

// 32-bit reads, 8-wide tiles, 1 / 2 / 4 / 8 rows.
uint  __builtin_IB_subgroup_block_read_flat_u32_m1k8v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
uint2 __builtin_IB_subgroup_block_read_flat_u32_m2k8v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
uint4 __builtin_IB_subgroup_block_read_flat_u32_m4k8v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
uint8 __builtin_IB_subgroup_block_read_flat_u32_m8k8v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);

// 32-bit reads, 16-wide tiles, 1 / 2 / 4 / 8 rows.
uint  __builtin_IB_subgroup_block_read_flat_u32_m1k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
uint2 __builtin_IB_subgroup_block_read_flat_u32_m2k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
uint4 __builtin_IB_subgroup_block_read_flat_u32_m4k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
uint8 __builtin_IB_subgroup_block_read_flat_u32_m8k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);

// 32-bit read, 8 rows, 8-wide tiles, two tiles (_v2).
uint8 __builtin_IB_subgroup_block_read_flat_u32_m8k8v2(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);

// 32-bit writes, 16-wide tiles, 1 / 2 / 4 / 8 rows.
void __builtin_IB_subgroup_block_write_flat_u32_m1k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, uint  data);
void __builtin_IB_subgroup_block_write_flat_u32_m2k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, uint2 data);
void __builtin_IB_subgroup_block_write_flat_u32_m4k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, uint4 data);
void __builtin_IB_subgroup_block_write_flat_u32_m8k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, uint8 data);
346-
// 2D block read of 32-bit elements; per the function name the tile is
// 1 row by 8 columns.
//   base_address - byte address of the matrix; reinterpreted as a long
//   width        - matrix width in bytes   (forwarded as width - 1)
//   height       - number of matrix rows   (forwarded as height - 1)
//   pitch        - bytes between rows      (forwarded as pitch - 1)
//   coord        - (x element, y row) to read from, forwarded unchanged
// Returns the uint produced by the m1k8v1 builtin, unmodified.
uint intel_sub_group_block_read_32b_1r8c(const __global void* base_address, int width, int height, int pitch, int2 coord)
{
    return __builtin_IB_subgroup_block_read_flat_u32_m1k8v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
}
// 2D block read of 32-bit elements; per the function name the tile is
// 2 rows by 8 columns.
//   base_address - byte address of the matrix; reinterpreted as a long
//   width        - matrix width in bytes   (forwarded as width - 1)
//   height       - number of matrix rows   (forwarded as height - 1)
//   pitch        - bytes between rows      (forwarded as pitch - 1)
//   coord        - (x element, y row) to read from, forwarded unchanged
// Returns only the .lo component of the uint2 produced by the m2k8v1 builtin.
// NOTE(review): presumably the .hi component carries no tile data for an
// 8-column tile with a 16-wide subgroup - confirm.
uint intel_sub_group_block_read_32b_2r8c(const __global void* base_address, int width, int height, int pitch, int2 coord)
{
    return __builtin_IB_subgroup_block_read_flat_u32_m2k8v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord).lo;
}
// 2D block read of 32-bit elements; per the function name the tile is
// 4 rows by 8 columns.
//   base_address - byte address of the matrix; reinterpreted as a long
//   width        - matrix width in bytes   (forwarded as width - 1)
//   height       - number of matrix rows   (forwarded as height - 1)
//   pitch        - bytes between rows      (forwarded as pitch - 1)
//   coord        - (x element, y row) to read from, forwarded unchanged
// Returns only the .lo half (uint2) of the uint4 produced by the m4k8v1 builtin.
// NOTE(review): presumably the .hi half carries no tile data for an
// 8-column tile with a 16-wide subgroup - confirm.
uint2 intel_sub_group_block_read_32b_4r8c(const __global void* base_address, int width, int height, int pitch, int2 coord)
{
    return __builtin_IB_subgroup_block_read_flat_u32_m4k8v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord).lo;
}
// 2D block read of 32-bit elements; per the function name the tile is
// 8 rows by 8 columns.
//   base_address - byte address of the matrix; reinterpreted as a long
//   width        - matrix width in bytes   (forwarded as width - 1)
//   height       - number of matrix rows   (forwarded as height - 1)
//   pitch        - bytes between rows      (forwarded as pitch - 1)
//   coord        - (x element, y row) to read from, forwarded unchanged
// Returns only the .lo half (uint4) of the uint8 produced by the m8k8v1 builtin.
// NOTE(review): presumably the .hi half carries no tile data for an
// 8-column tile with a 16-wide subgroup - confirm.
uint4 intel_sub_group_block_read_32b_8r8c(const __global void* base_address, int width, int height, int pitch, int2 coord)
{
    return __builtin_IB_subgroup_block_read_flat_u32_m8k8v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord).lo;
}
363-
// 2D block read of 32-bit elements; per the function name the tile is
// 1 row by 16 columns.
//   base_address - byte address of the matrix; reinterpreted as a long
//   width        - matrix width in bytes   (forwarded as width - 1)
//   height       - number of matrix rows   (forwarded as height - 1)
//   pitch        - bytes between rows      (forwarded as pitch - 1)
//   coord        - (x element, y row) to read from, forwarded unchanged
// Returns the uint produced by the m1k16v1 builtin, unmodified.
uint intel_sub_group_block_read_32b_1r16c(const __global void* base_address, int width, int height, int pitch, int2 coord)
{
    return __builtin_IB_subgroup_block_read_flat_u32_m1k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
}
// 2D block read of 32-bit elements; per the function name the tile is
// 2 rows by 16 columns.
//   base_address - byte address of the matrix; reinterpreted as a long
//   width        - matrix width in bytes   (forwarded as width - 1)
//   height       - number of matrix rows   (forwarded as height - 1)
//   pitch        - bytes between rows      (forwarded as pitch - 1)
//   coord        - (x element, y row) to read from, forwarded unchanged
// Returns the uint2 produced by the m2k16v1 builtin, unmodified.
uint2 intel_sub_group_block_read_32b_2r16c(const __global void* base_address, int width, int height, int pitch, int2 coord)
{
    return __builtin_IB_subgroup_block_read_flat_u32_m2k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
}
// 2D block read of 32-bit elements; per the function name the tile is
// 4 rows by 16 columns.
//   base_address - byte address of the matrix; reinterpreted as a long
//   width        - matrix width in bytes   (forwarded as width - 1)
//   height       - number of matrix rows   (forwarded as height - 1)
//   pitch        - bytes between rows      (forwarded as pitch - 1)
//   coord        - (x element, y row) to read from, forwarded unchanged
// Returns the uint4 produced by the m4k16v1 builtin, unmodified.
uint4 intel_sub_group_block_read_32b_4r16c(const __global void* base_address, int width, int height, int pitch, int2 coord)
{
    return __builtin_IB_subgroup_block_read_flat_u32_m4k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
}
// 2D block read of 32-bit elements; per the function name the tile is
// 8 rows by 16 columns.
//   base_address - byte address of the matrix; reinterpreted as a long
//   width        - matrix width in bytes   (forwarded as width - 1)
//   height       - number of matrix rows   (forwarded as height - 1)
//   pitch        - bytes between rows      (forwarded as pitch - 1)
//   coord        - (x element, y row) to read from, forwarded unchanged
// Returns the uint8 produced by the m8k16v1 builtin, unmodified.
uint8 intel_sub_group_block_read_32b_8r16c(const __global void* base_address, int width, int height, int pitch, int2 coord)
{
    return __builtin_IB_subgroup_block_read_flat_u32_m8k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
}
380-
// 2D block read of 32-bit elements using the two-tile (_v2) form of the
// 8-row, 8-wide builtin.
// NOTE(review): the "8r8x2c" name presumably means two side-by-side
// 8x8 tiles - confirm against the builtin's documentation.
//   base_address - byte address of the matrix; reinterpreted as a long
//   width        - matrix width in bytes   (forwarded as width - 1)
//   height       - number of matrix rows   (forwarded as height - 1)
//   pitch        - bytes between rows      (forwarded as pitch - 1)
//   coord        - (x element, y row) to read from, forwarded unchanged
// Returns the uint8 produced by the m8k8v2 builtin, unmodified.
uint8 intel_sub_group_block_read_32b_8r8x2c(const __global void* base_address, int width, int height, int pitch, int2 coord)
{
    return __builtin_IB_subgroup_block_read_flat_u32_m8k8v2(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
}
385-
386-
// Fallback value for BLOCK_PREFETCH_CACHE_TYPE: when the macro has not been
// defined earlier (e.g. via a build option), it becomes LSC_LDCC_L1C_L3C.
// NOTE(review): presumably consumed by block-prefetch helpers that are not
// visible in this part of the file - confirm.
#if !defined(BLOCK_PREFETCH_CACHE_TYPE)
#define BLOCK_PREFETCH_CACHE_TYPE LSC_LDCC_L1C_L3C
#endif
390-
// 2D block write of 32-bit elements; per the function name the tile is
// 1 row by 16 columns.
//   base_address - destination matrix address; reinterpreted as a long
//   width, height, pitch - each forwarded to the builtin as (value - 1)
//   coord        - forwarded unchanged
//   data         - uint payload handed to the m1k16v1 write builtin
void intel_sub_group_block_write_32b_1r16c(__global void* base_address, int width, int height, int pitch, int2 coord, uint data)
{
    __builtin_IB_subgroup_block_write_flat_u32_m1k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data);
}
// 2D block write of 32-bit elements; per the function name the tile is
// 2 rows by 16 columns.
//   base_address - destination matrix address; reinterpreted as a long
//   width, height, pitch - each forwarded to the builtin as (value - 1)
//   coord        - forwarded unchanged
//   data         - uint2 payload handed to the m2k16v1 write builtin
void intel_sub_group_block_write_32b_2r16c(__global void* base_address, int width, int height, int pitch, int2 coord, uint2 data)
{
    __builtin_IB_subgroup_block_write_flat_u32_m2k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data);
}
// 2D block write of 32-bit elements; per the function name the tile is
// 4 rows by 16 columns.
//   base_address - destination matrix address; reinterpreted as a long
//   width, height, pitch - each forwarded to the builtin as (value - 1)
//   coord        - forwarded unchanged
//   data         - uint4 payload handed to the m4k16v1 write builtin
void intel_sub_group_block_write_32b_4r16c(__global void* base_address, int width, int height, int pitch, int2 coord, uint4 data)
{
    __builtin_IB_subgroup_block_write_flat_u32_m4k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data);
}
// 2D block write of 32-bit elements; per the function name the tile is
// 8 rows by 16 columns.
//   base_address - destination matrix address; reinterpreted as a long
//   width, height, pitch - each forwarded to the builtin as (value - 1)
//   coord        - forwarded unchanged
//   data         - uint8 payload handed to the m8k16v1 write builtin
void intel_sub_group_block_write_32b_8r16c(__global void* base_address, int width, int height, int pitch, int2 coord, uint8 data)
{
    __builtin_IB_subgroup_block_write_flat_u32_m8k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data);
}
407-
408- #endif // cl_intel_subgroup_2d_block_io
0 commit comments