Skip to content

Commit ddc93ff

Browse files
committed
add transpose block read variant
1 parent 41159a8 commit ddc93ff

3 files changed

Lines changed: 204 additions & 0 deletions

File tree

samples/99_matrixexperimentsi8/main.cpp

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,23 @@ static void compute_reference(
142142
}
143143
}
144144

145+
template <typename DstT, typename SrcT>
146+
static void compute_reference_TN(
147+
std::vector<DstT>& C,
148+
const std::vector<SrcT>& A, const std::vector<SrcT>& B,
149+
size_t M, size_t N, size_t K)
150+
{
151+
for (size_t m = 0; m < M; m++) {
152+
for (size_t n = 0; n < N; n++) {
153+
DstT sum = 0;
154+
for (size_t k = 0; k < K; k++) {
155+
sum = A[k * K + m] * B[k * N + n] + sum;
156+
}
157+
C[m * N + n] = sum;
158+
}
159+
}
160+
}
161+
145162
template <typename T>
146163
void check_results(
147164
size_t M,
@@ -660,6 +677,107 @@ static void i8_dpas_blockread_vnni_tiled(
660677
}
661678
}
662679

680+
static void i8_naive_TN(
681+
cl::Context& context, cl::Program& program, cl::CommandQueue& queue,
682+
cl::Buffer& C, cl::Buffer& A, cl::Buffer& B,
683+
size_t M, size_t N, size_t K,
684+
const std::vector<int>& C_ref)
685+
{
686+
printf("%80s: ", makeTestName(__FUNCTION__, M, N, K).c_str()); fflush(stdout);
687+
688+
cl::Kernel kernel{program, "i8_naive_TN"};
689+
if (kernel() == nullptr) {
690+
printf("unsupported.\n");
691+
} else {
692+
kernel.setArg(0, C);
693+
kernel.setArg(1, A);
694+
kernel.setArg(2, B);
695+
kernel.setArg(3, static_cast<cl_int>(K));
696+
697+
if (!skipinit) {
698+
queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0]));
699+
}
700+
701+
float best = 999.0f;
702+
for (int test = 0; test < testIterations; test++) {
703+
cl::Event event;
704+
auto start = test_clock::now();
705+
queue.enqueueNDRangeKernel(kernel, cl::NullRange,
706+
cl::NDRange{N, M}, cl::NullRange, nullptr, &event);
707+
queue.finish();
708+
auto end = test_clock::now();
709+
std::chrono::duration<float> sw_time = end - start;
710+
auto elapsed = wallclock ? sw_time.count() : hw_time(event);
711+
best = std::min(best, elapsed);
712+
}
713+
auto gops = 2.0 * M * N * K / best / 1e9;
714+
printf("Best in %f seconds (%f gops)\n", best, gops);
715+
716+
if (validate) {
717+
printf("Checking results... "); fflush(stdout);
718+
std::vector<int> C_check(C_ref.size());
719+
queue.enqueueReadBuffer(C, CL_TRUE, 0, C_check.size() * sizeof(C_check[0]), C_check.data());
720+
check_results(M, N, C_check, C_ref);
721+
printf(" done!\n");
722+
}
723+
}
724+
}
725+
726+
template<int tM, int tN>
727+
static void i8_dpas_blockread_rowmajor_TN(
728+
cl::Context& context, cl::Program& program, cl::CommandQueue& queue,
729+
cl::Buffer& C, cl::Buffer& A, cl::Buffer& B,
730+
size_t M, size_t N, size_t K,
731+
const std::vector<int>& C_ref)
732+
{
733+
printf("%80s: ", makeTestName(__FUNCTION__, tM, tN, M, N, K).c_str()); fflush(stdout);
734+
735+
std::string kernelName = "i8_dpas_blockread_rowmajor_TN";
736+
kernelName += "_m" + std::to_string(tM);
737+
kernelName += "_n" + std::to_string(tN);
738+
cl::Kernel kernel{program, kernelName.c_str()};
739+
if (kernel() == nullptr) {
740+
printf("unsupported.\n");
741+
} else if (K < 64 || N < 64/4) {
742+
printf("matrix pitch for block reads must be >= 64 bytes.\n");
743+
} else {
744+
kernel.setArg(0, C);
745+
kernel.setArg(1, A);
746+
kernel.setArg(2, B);
747+
kernel.setArg(3, static_cast<cl_int>(K));
748+
if (roundRobin) {
749+
setRoundRobin(kernel);
750+
}
751+
752+
if (!skipinit) {
753+
queue.enqueueFillBuffer(C, 0, 0, C_ref.size() * sizeof(C_ref[0]));
754+
}
755+
756+
float best = 999.0f;
757+
for (int test = 0; test < testIterations; test++) {
758+
cl::Event event;
759+
auto start = test_clock::now();
760+
queue.enqueueNDRangeKernel(kernel, cl::NullRange,
761+
cl::NDRange{N, M/tM}, cl::NullRange, nullptr, &event);
762+
queue.finish();
763+
auto end = test_clock::now();
764+
std::chrono::duration<float> sw_time = end - start;
765+
auto elapsed = wallclock ? sw_time.count() : hw_time(event);
766+
best = std::min(best, elapsed);
767+
}
768+
auto gops = 2.0 * M * N * K / best / 1e9;
769+
printf("Best in %f seconds (%f gops)\n", best, gops);
770+
771+
if (validate) {
772+
printf("Checking results... "); fflush(stdout);
773+
std::vector<int> C_check(C_ref.size());
774+
queue.enqueueReadBuffer(C, CL_TRUE, 0, C_check.size() * sizeof(C_check[0]), C_check.data());
775+
check_results(M, N, C_check, C_ref);
776+
printf(" done!\n");
777+
}
778+
}
779+
}
780+
663781
int main(int argc, char** argv)
664782
{
665783
int platformIndex = 0;
@@ -784,6 +902,7 @@ int main(int argc, char** argv)
784902
std::vector<int8_t> Bvnni_vec(K * N);
785903

786904
std::vector<int> C_ref(M * N);
905+
std::vector<int> C_TN_ref(M * N);
787906

788907
printf("Initializing source matrices...\n");
789908
fill_matrix(A_vec, M, K);
@@ -794,6 +913,8 @@ int main(int argc, char** argv)
794913
if (validate) {
795914
printf("Computing reference...\n");
796915
compute_reference(C_ref, A_vec, B_vec, M, N, K);
916+
printf("Computing transposed reference...\n");
917+
compute_reference_TN(C_TN_ref, A_vec, B_vec, M, N, K);
797918
}
798919

799920
printf("Creating source buffers...\n");
@@ -910,6 +1031,11 @@ int main(int argc, char** argv)
9101031
i8_dpas_blockread_vnni_tiled<8, 16, 4, 4>(context, program, queue, C, A, Bvnni, M, N, K, C_ref);
9111032
}
9121033

1034+
if (mask & 0x2000) {
1035+
//i8_naive_TN(context, program, queue, C, A, B, M, N, K, C_TN_ref);
1036+
i8_dpas_blockread_rowmajor_TN<4, 16>(context, program, queue, C, A, B, M, N, K, C_TN_ref);
1037+
}
1038+
9131039
printf("Done.\n");
9141040

9151041
return 0;

samples/99_matrixexperimentsi8/matrix_helpers_i8.cl

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -742,4 +742,20 @@ void intel_subgroup_block_write_u32_m8k16(__global void* base_address, int width
742742
__builtin_IB_subgroup_block_write_flat_u32_m8k16v1(as_long(base_address), width - 1, height - 1, pitch - 1, coord, data);
743743
}
744744

745+
uint __builtin_IB_subgroup_block_read_flat_transpose_u32_k1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
746+
uint2 __builtin_IB_subgroup_block_read_flat_transpose_u32_m32k1(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);
747+
748+
void intel_sub_group_2d_block_read_transpose_32b_16r1x1c(global void* base_address, int width, int height, int pitch, int2 coord, private uint* destination)
749+
{
750+
uint temp = __builtin_IB_subgroup_block_read_flat_transpose_u32_k1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
751+
destination[0] = temp;
752+
}
753+
754+
void intel_sub_group_2d_block_read_transpose_32b_32r1x1c(global void* base_address, int width, int height, int pitch, int2 coord, private uint* destination)
755+
{
756+
uint2 temp = __builtin_IB_subgroup_block_read_flat_transpose_u32_m32k1(as_long(base_address), width - 1, height - 1, pitch - 1, coord);
757+
destination[0] = temp.s0;
758+
destination[1] = temp.s1;
759+
}
760+
745761
#endif // cl_intel_subgroup_extended_block_read

samples/99_matrixexperimentsi8/matrix_kernels_i8.cl

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,24 @@ kernel void i8_naive(global int* C, global char* A, global char* B, int K)
2727
C[m * N + n] = sum;
2828
}
2929

30+
kernel void i8_naive_TN(global int* C, global char* A, global char* B, int K)
31+
{
32+
const int N = get_global_size(0);
33+
const int m = get_global_id(1);
34+
const int n = get_global_id(0);
35+
36+
int sum = 0;
37+
for (int k = 0; k < K; k++) {
38+
sum = A[k * K + m] * B[k * N + n] + sum;
39+
if (get_global_id(0) == 1 && get_global_id(1) == 0) {
40+
printf("after iteration %d: sum is %d\n", k, sum);
41+
}
42+
}
43+
44+
sum = activation(sum);
45+
C[m * N + n] = sum;
46+
}
47+
3048
// For all i8 kernels tK == 32:
3149
#define tK 32
3250

@@ -574,6 +592,50 @@ kernel void i8_dpas_blockread_vnni_m8_n16(global int* C, global char* A, global
574592
intel_sub_group_2d_block_write_32b_8r16x1c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), (uint*)&sum);
575593
}
576594

595+
__attribute__((intel_reqd_sub_group_size(16))) __attribute__((reqd_work_group_size(16, 1, 1)))
596+
kernel void i8_dpas_blockread_rowmajor_TN_m4_n16(global int* C, global char* A, global char* B, int K)
597+
{
598+
__builtin_assume(K > 0); // Always at least one K iteration.
599+
const int tM = 4;
600+
const int tN = 16;
601+
const int M = get_global_size(1) * tM;
602+
const int N = get_global_size(0);
603+
const int m = get_group_id(1) * tM;
604+
const int n = get_group_id(0) * tN;
605+
const int sglid = get_sub_group_local_id();
606+
607+
int4 sum = 0;
608+
for (int k = 0; k < K; k += tK) {
609+
int2 readData;
610+
intel_sub_group_2d_block_read_transpose_32b_32r1x1c(A, M * sizeof(char), K, M * sizeof(char), (int2)(m / 4, k), (uint*)&readData);
611+
612+
// Note: after the transpose block read:
613+
// readData.s0 contains row 0-15
614+
// readData.s1 contains row 16-31
615+
// So, WI0 has rows 0 and 16, WI1 has rows 1 and 17, etc.
616+
// We want WI0 to have rows 0 and 1, WI1 to have rows 2 and 3, etc.
617+
int shuffledData0 = (sglid < 8) ?
618+
sub_group_shuffle(readData.s0, (sglid * 2)) :
619+
sub_group_shuffle(readData.s1, (sglid * 2) % 16);
620+
int shuffledData1 = (sglid < 8) ?
621+
sub_group_shuffle(readData.s0, (sglid * 2) + 1) :
622+
sub_group_shuffle(readData.s1, (sglid * 2) % 16 + 1);
623+
624+
short4 aData;
625+
aData.s0 = as_short((char2)(as_char4(shuffledData0).s0, as_char4(shuffledData1).s0));
626+
aData.s1 = as_short((char2)(as_char4(shuffledData0).s1, as_char4(shuffledData1).s1));
627+
aData.s2 = as_short((char2)(as_char4(shuffledData0).s2, as_char4(shuffledData1).s2));
628+
aData.s3 = as_short((char2)(as_char4(shuffledData0).s3, as_char4(shuffledData1).s3));
629+
630+
int8 bData;
631+
intel_sub_group_2d_block_read_transform_8b_32r16x1c(B, N * sizeof(char), K, N * sizeof(char), (int2)(n, k), (uint*)&bData);
632+
sum = mat_mul_sg16(aData, bData, sum);
633+
}
634+
635+
sum = activation(sum);
636+
intel_sub_group_2d_block_write_32b_4r16x1c(C, N * sizeof(float), M, N * sizeof(float), (int2)(n, m), (uint*)&sum);
637+
}
638+
577639
#endif // cl_intel_subgroup_2d_block_io
578640

579641
#if 0 // disable the tiled cases for now

0 commit comments

Comments
 (0)