Skip to content

Commit bb48fd4

Browse files
committed
enable more int8 samples
1 parent bb11953 commit bb48fd4

3 files changed

Lines changed: 36 additions & 207 deletions

File tree

samples/99_matrixexperimentsi8/matrix_helpers_i8.cl

Lines changed: 17 additions & 186 deletions
Original file line numberDiff line numberDiff line change
@@ -630,7 +630,6 @@ void store_c_rowmajor_int32_m8_nx(global int* C, int8 v, int rowStart, int colSt
630630

631631
#endif // defined(cl_intel_subgroups) && defined(cl_intel_subgroups_short)
632632

633-
#if 0
634633
#ifdef cl_intel_subgroup_extended_block_read
635634

636635
// Note for 2D block reads:
@@ -689,205 +688,42 @@ enum LSC_LDCC {
689688
LSC_LDCC_L1IAR_L3C = 7, // Override to L1 invalidate-after-read, and L3 cached
690689
};
691690

692-
typedef ushort __attribute__((ext_vector_type(32))) ushort32;
693-
typedef ushort __attribute__((ext_vector_type(64))) ushort64;
694-
695-
typedef uint __attribute__((ext_vector_type(32))) uint32;
696-
697691
// Define block reads, prefetches, and writes. These are supported by the hardware but are not in the headers:
698692

699-
ushort __builtin_IB_subgroup_block_read_flat_u16_m1k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
700-
ushort2 __builtin_IB_subgroup_block_read_flat_u16_m2k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
701-
ushort4 __builtin_IB_subgroup_block_read_flat_u16_m4k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
702-
ushort8 __builtin_IB_subgroup_block_read_flat_u16_m8k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
703-
ushort16 __builtin_IB_subgroup_block_read_flat_u16_m16k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
704-
ushort32 __builtin_IB_subgroup_block_read_flat_u16_m32k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
705-
706-
ushort32 __builtin_IB_subgroup_block_read_flat_u16_m16k16v2(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
707-
ushort64 __builtin_IB_subgroup_block_read_flat_u16_m32k16v2(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
693+
ushort __builtin_IB_subgroup_block_read_flat_u8_m1k32v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
694+
ushort2 __builtin_IB_subgroup_block_read_flat_u8_m2k32v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
695+
ushort4 __builtin_IB_subgroup_block_read_flat_u8_m4k32v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
696+
ushort8 __builtin_IB_subgroup_block_read_flat_u8_m8k32v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
708697

709698
uint8 __builtin_IB_subgroup_block_read_flat_u32_m8k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
710-
uint16 __builtin_IB_subgroup_block_read_flat_u32_m16k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
711-
712-
uint16 __builtin_IB_subgroup_block_read_flat_transform_u16_k32(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
713-
714-
uint16 __builtin_IB_subgroup_block_read_flat_transform_u16_k16v2(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
715-
uint32 __builtin_IB_subgroup_block_read_flat_transform_u16_k32v2(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
716699

717-
718-
void __builtin_IB_subgroup_block_read_prefetch_u16_m1k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, enum LSC_LDCC cache_control);
719-
void __builtin_IB_subgroup_block_read_prefetch_u16_m2k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, enum LSC_LDCC cache_control);
720-
void __builtin_IB_subgroup_block_read_prefetch_u16_m4k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, enum LSC_LDCC cache_control);
721-
void __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, enum LSC_LDCC cache_control);
722-
void __builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, enum LSC_LDCC cache_control);
723-
void __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, enum LSC_LDCC cache_control);
724-
void __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, enum LSC_LDCC cache_control);
725-
726-
void __builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, enum LSC_LDCC cache_control);
727-
void __builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, enum LSC_LDCC cache_control);
728-
729-
void __builtin_IB_subgroup_block_read_prefetch_u32_m8k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, enum LSC_LDCC cache_control);
730-
void __builtin_IB_subgroup_block_read_prefetch_u32_m16k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, enum LSC_LDCC cache_control);
731-
732-
733-
void __builtin_IB_subgroup_block_write_flat_u32_m1k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, uint data);
734-
void __builtin_IB_subgroup_block_write_flat_u32_m2k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, uint2 data);
735-
void __builtin_IB_subgroup_block_write_flat_u32_m4k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, uint4 data);
736-
void __builtin_IB_subgroup_block_write_flat_u32_m8k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, uint8 data);
737-
void __builtin_IB_subgroup_block_write_flat_u32_m16k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, uint16 data);
738-
739-
ushort intel_subgroup_block_read_u16_m1k16(const __global void *base_address, int width, int height, int pitch, int2 coord)
740-
{
741-
return __builtin_IB_subgroup_block_read_flat_u16_m1k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
742-
}
743-
ushort2 intel_subgroup_block_read_u16_m2k16(const __global void *base_address, int width, int height, int pitch, int2 coord)
700+
ushort intel_subgroup_block_read_u8_m1k32(const __global void *base_address, int width, int height, int pitch, int2 coord)
744701
{
745-
return __builtin_IB_subgroup_block_read_flat_u16_m2k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
702+
return __builtin_IB_subgroup_block_read_flat_u8_m1k32v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
746703
}
747-
ushort4 intel_subgroup_block_read_u16_m4k16(const __global void *base_address, int width, int height, int pitch, int2 coord)
704+
ushort2 intel_subgroup_block_read_u8_m2k32(const __global void *base_address, int width, int height, int pitch, int2 coord)
748705
{
749-
return __builtin_IB_subgroup_block_read_flat_u16_m4k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
706+
return __builtin_IB_subgroup_block_read_flat_u8_m2k32v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
750707
}
751-
ushort8 intel_subgroup_block_read_u16_m8k16(const __global void *base_address, int width, int height, int pitch, int2 coord)
708+
ushort4 intel_subgroup_block_read_u8_m4k32(const __global void *base_address, int width, int height, int pitch, int2 coord)
752709
{
753-
return __builtin_IB_subgroup_block_read_flat_u16_m8k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
710+
return __builtin_IB_subgroup_block_read_flat_u8_m4k32v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
754711
}
755-
ushort16 intel_subgroup_block_read_u16_m16k16(const __global void *base_address, int width, int height, int pitch, int2 coord)
712+
ushort8 intel_subgroup_block_read_u8_m8k32(const __global void *base_address, int width, int height, int pitch, int2 coord)
756713
{
757-
return __builtin_IB_subgroup_block_read_flat_u16_m16k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
758-
}
759-
void intel_subgroup_block_read_u16_m32k16(const __global void *base_address, int width, int height, int pitch, int2 coord, ushort8 dst[4])
760-
{
761-
ushort32 tmp = __builtin_IB_subgroup_block_read_flat_u16_m32k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
762-
dst[0] = tmp.lo.lo;
763-
dst[1] = tmp.lo.hi;
764-
dst[2] = tmp.hi.lo;
765-
dst[3] = tmp.hi.hi;
714+
return __builtin_IB_subgroup_block_read_flat_u8_m8k32v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
766715
}
767716

768-
void intel_subgroup_block_read_u16_m16k16v2(const __global void *base_address, int width, int height, int pitch, int2 coord, ushort8 dst[2][2])
769-
{
770-
ushort32 tmp = __builtin_IB_subgroup_block_read_flat_u16_m16k16v2(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
771-
dst[0][0] = tmp.lo.lo;
772-
dst[0][1] = tmp.lo.hi;
773-
dst[1][0] = tmp.hi.lo;
774-
dst[1][1] = tmp.hi.hi;
775-
}
776-
void intel_subgroup_block_read_u16_m32k16v2(const __global void *base_address, int width, int height, int pitch, int2 coord, ushort8 dst[2][4])
777-
{
778-
ushort64 tmp = __builtin_IB_subgroup_block_read_flat_u16_m32k16v2(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
779-
dst[0][0] = tmp.lo.lo.lo;
780-
dst[0][1] = tmp.lo.lo.hi;
781-
dst[0][2] = tmp.lo.hi.lo;
782-
dst[0][3] = tmp.lo.hi.hi;
783-
dst[1][0] = tmp.hi.lo.lo;
784-
dst[1][1] = tmp.hi.lo.hi;
785-
dst[1][2] = tmp.hi.hi.lo;
786-
dst[1][3] = tmp.hi.hi.hi;
787-
}
788-
789-
uint8 intel_subgroup_block_read_u32_m8k16(const __global void* base_address, int width, int height, int pitch, int2 coord)
717+
uint8 intel_subgroup_block_read_u32_m8k16(const __global void *base_address, int width, int height, int pitch, int2 coord)
790718
{
791719
return __builtin_IB_subgroup_block_read_flat_u32_m8k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
792720
}
793-
uint16 intel_subgroup_block_read_u32_m16k16(const __global void* base_address, int width, int height, int pitch, int2 coord)
794-
{
795-
return __builtin_IB_subgroup_block_read_flat_u32_m16k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
796-
}
797-
798-
// Each block is K rows x N columns, where the K rows have been VNNI transformed.
799-
int8 intel_subgroup_block_read_transform_u16_k16n16(__global void *base_address, int width, int height, int pitch, int2 coord)
800-
{
801-
// Note: this function is in the headers, but is named confusingly and returns unsigned integers rather than signed integers:
802-
return as_int8(intel_subgroup_block_read_transform_u16_k16(base_address, width, height, pitch, coord));
803-
}
804-
int16 intel_subgroup_block_read_transform_u16_k32n16(__global void *base_address, int width, int height, int pitch, int2 coord)
805-
{
806-
return as_int16(__builtin_IB_subgroup_block_read_flat_transform_u16_k32(as_long(base_address), width - 1, height - 1, pitch - 1, coord));
807-
}
808-
int16 intel_subgroup_block_read_transform_u16_k16n16v2(__global void *base_address, int width, int height, int pitch, int2 coord)
809-
{
810-
return as_int16(__builtin_IB_subgroup_block_read_flat_transform_u16_k16v2(as_long(base_address), width - 1, height - 1, pitch - 1, coord));
811-
}
812-
void intel_subgroup_block_read_transform_u16_k32n16v2(__global void *base_address, int width, int height, int pitch, int2 coord, int8 dst[2][2])
813-
{
814-
uint32 tmp = __builtin_IB_subgroup_block_read_flat_transform_u16_k32v2(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
815-
dst[0][0] = as_int8(tmp.lo.lo);
816-
dst[0][1] = as_int8(tmp.lo.hi);
817-
dst[1][0] = as_int8(tmp.hi.lo);
818-
dst[1][1] = as_int8(tmp.hi.hi);
819-
}
820-
821721

822-
#define BLOCK_PREFETCH_CACHE_TYPE LSC_LDCC_L1C_L3C
823-
824-
void intel_subgroup_block_prefetch_u16_m1k16(const __global void *base_address, int width, int height, int pitch, int2 coord)
825-
{
826-
#if defined(PREFETCH_DEFAULT)
827-
__builtin_IB_subgroup_block_read_prefetch_u16_m1k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, BLOCK_PREFETCH_CACHE_TYPE);
828-
#endif // defined(PREFETCH_DEFAULT)
829-
}
830-
void intel_subgroup_block_prefetch_u16_m2k16(const __global void *base_address, int width, int height, int pitch, int2 coord)
831-
{
832-
#if defined(PREFETCH_DEFAULT)
833-
__builtin_IB_subgroup_block_read_prefetch_u16_m2k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, BLOCK_PREFETCH_CACHE_TYPE);
834-
#endif // defined(PREFETCH_DEFAULT)
835-
}
836-
void intel_subgroup_block_prefetch_u16_m4k16(const __global void *base_address, int width, int height, int pitch, int2 coord)
837-
{
838-
#if defined(PREFETCH_DEFAULT)
839-
__builtin_IB_subgroup_block_read_prefetch_u16_m4k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, BLOCK_PREFETCH_CACHE_TYPE);
840-
#endif // defined(PREFETCH_DEFAULT)
841-
}
842-
void intel_subgroup_block_prefetch_u16_m8k16(const __global void *base_address, int width, int height, int pitch, int2 coord)
843-
{
844-
#if defined(PREFETCH_DEFAULT)
845-
__builtin_IB_subgroup_block_read_prefetch_u16_m8k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, BLOCK_PREFETCH_CACHE_TYPE);
846-
#endif // defined(PREFETCH_DEFAULT)
847-
}
848-
void intel_subgroup_block_prefetch_u16_m8k16v2(__global void *base_address, int width, int height, int pitch, int2 coord)
849-
{
850-
#if defined(PREFETCH_DEFAULT)
851-
__builtin_IB_subgroup_block_read_prefetch_u16_m8k16v2(as_long(base_address), width - 1, height - 1, pitch - 1, coord, BLOCK_PREFETCH_CACHE_TYPE);
852-
#endif // defined(PREFETCH_DEFAULT)
853-
}
854-
void intel_subgroup_block_prefetch_u16_m16k16(const __global void *base_address, int width, int height, int pitch, int2 coord)
855-
{
856-
#if defined(PREFETCH_DEFAULT)
857-
__builtin_IB_subgroup_block_read_prefetch_u16_m16k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, BLOCK_PREFETCH_CACHE_TYPE);
858-
#endif // defined(PREFETCH_DEFAULT)
859-
}
860-
void intel_subgroup_block_prefetch_u16_m32k16(const __global void *base_address, int width, int height, int pitch, int2 coord)
861-
{
862-
#if defined(PREFETCH_DEFAULT)
863-
__builtin_IB_subgroup_block_read_prefetch_u16_m32k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, BLOCK_PREFETCH_CACHE_TYPE);
864-
#endif // defined(PREFETCH_DEFAULT)
865-
}
866-
void intel_subgroup_block_prefetch_u16_m16k16v2(const __global void *base_address, int width, int height, int pitch, int2 coord)
867-
{
868-
#if defined(PREFETCH_DEFAULT)
869-
__builtin_IB_subgroup_block_read_prefetch_u16_m16k16v2(as_long(base_address), width - 1, height - 1, pitch - 1, coord, BLOCK_PREFETCH_CACHE_TYPE);
870-
#endif // defined(PREFETCH_DEFAULT)
871-
}
872-
void intel_subgroup_block_prefetch_u16_m32k16v2(const __global void *base_address, int width, int height, int pitch, int2 coord)
873-
{
874-
#if defined(PREFETCH_DEFAULT)
875-
__builtin_IB_subgroup_block_read_prefetch_u16_m32k16v2(as_long(base_address), width - 1, height - 1, pitch - 1, coord, BLOCK_PREFETCH_CACHE_TYPE);
876-
#endif // defined(PREFETCH_DEFAULT)
877-
}
878-
void intel_subgroup_block_prefetch_u32_m8k16(const __global void *base_address, int width, int height, int pitch, int2 coord)
879-
{
880-
#if defined(PREFETCH_DEFAULT)
881-
__builtin_IB_subgroup_block_read_prefetch_u32_m8k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, BLOCK_PREFETCH_CACHE_TYPE);
882-
#endif // defined(PREFETCH_DEFAULT)
883-
}
884-
void intel_subgroup_block_prefetch_u32_m16k16(const __global void *base_address, int width, int height, int pitch, int2 coord)
885-
{
886-
#if defined(PREFETCH_DEFAULT)
887-
__builtin_IB_subgroup_block_read_prefetch_u32_m16k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, BLOCK_PREFETCH_CACHE_TYPE);
888-
#endif // defined(PREFETCH_DEFAULT)
889-
}
890722

723+
void __builtin_IB_subgroup_block_write_flat_u32_m1k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, uint data);
724+
void __builtin_IB_subgroup_block_write_flat_u32_m2k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, uint2 data);
725+
void __builtin_IB_subgroup_block_write_flat_u32_m4k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, uint4 data);
726+
void __builtin_IB_subgroup_block_write_flat_u32_m8k16v1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord, uint8 data);
891727

892728
void intel_subgroup_block_write_u32_m1k16(__global void* base_address, int width, int height, int pitch, int2 coord, uint data)
893729
{
@@ -905,10 +741,5 @@ void intel_subgroup_block_write_u32_m8k16(__global void* base_address, int width
905741
{
906742
__builtin_IB_subgroup_block_write_flat_u32_m8k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data);
907743
}
908-
void intel_subgroup_block_write_u32_m16k16(__global void* base_address, int width, int height, int pitch, int2 coord, uint16 data)
909-
{
910-
__builtin_IB_subgroup_block_write_flat_u32_m16k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data);
911-
}
912744

913745
#endif // cl_intel_subgroup_extended_block_read
914-
#endif

samples/99_matrixexperimentsi8/matrix_kernel_tiled_i8.cl

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
#error "Needs to be updated!"
2-
31
#if !defined(tK)
42
#error "tK is undefined! This should be defined as the K dimension of the matrix tiles, which is dependent on the element type, likely 16 or 32."
53
#endif

0 commit comments

Comments
 (0)