@@ -50,39 +50,42 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
5050 ".align 16 \n\t"
5151 ".L01LOOP%=: \n\t"
5252
53+ "prefetcht0 256(%4,%0,8) \n\t"
5354 "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
5455 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
5556
5657 "vmulpd %%ymm8 , %%ymm0 , %%ymm12 \n\t"
5758 "vmulpd %%ymm8 , %%ymm1 , %%ymm13 \n\t"
59+ "prefetcht0 256(%5,%0,8) \n\t"
5860 "vmulpd %%ymm9 , %%ymm0 , %%ymm14 \n\t"
59- "vmulpd %%ymm9 , %%ymm1 , %%ymm15 \n\t"
60-
6161 "vmovups (%5,%0,8), %%ymm8 \n\t" // 2 complex values form a0
62+ "vmulpd %%ymm9 , %%ymm1 , %%ymm15 \n\t"
6263 "vmovups 32(%5,%0,8), %%ymm9 \n\t" // 2 complex values form a0
6364
6465 "vmulpd %%ymm8 , %%ymm2 , %%ymm10 \n\t"
6566 "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t"
6667 "vmulpd %%ymm8 , %%ymm3 , %%ymm11 \n\t"
6768 "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t"
69+ "prefetcht0 256(%6,%0,8) \n\t"
6870 "vmulpd %%ymm9 , %%ymm2 , %%ymm10 \n\t"
6971 "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t"
72+ "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a0
7073 "vmulpd %%ymm9 , %%ymm3 , %%ymm11 \n\t"
7174 "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t"
7275
73- "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a0
7476 "vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a0
7577
7678 "vmulpd %%ymm8 , %%ymm4 , %%ymm10 \n\t"
7779 "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t"
7880 "vmulpd %%ymm8 , %%ymm5 , %%ymm11 \n\t"
7981 "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t"
82+ "prefetcht0 256(%7,%0,8) \n\t"
8083 "vmulpd %%ymm9 , %%ymm4 , %%ymm10 \n\t"
8184 "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t"
85+ "vmovups (%7,%0,8), %%ymm8 \n\t" // 2 complex values form a0
8286 "vmulpd %%ymm9 , %%ymm5 , %%ymm11 \n\t"
8387 "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t"
8488
85- "vmovups (%7,%0,8), %%ymm8 \n\t" // 2 complex values form a0
8689 "vmovups 32(%7,%0,8), %%ymm9 \n\t" // 2 complex values form a0
8790
8891 "vmulpd %%ymm8 , %%ymm6 , %%ymm10 \n\t"
@@ -94,7 +97,7 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
9497 "vmulpd %%ymm9 , %%ymm7 , %%ymm11 \n\t"
9598 "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t"
9699
97- "prefetcht0 192 (%3,%0,8) \n\t"
100+ "prefetcht0 256 (%3,%0,8) \n\t"
98101 "vmovups (%3,%0,8), %%ymm10 \n\t"
99102 "vmovups 32(%3,%0,8), %%ymm11 \n\t"
100103
0 commit comments