@@ -53,19 +53,14 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
5353 "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
5454 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
5555
56- "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t"
57- "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t"
58- "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t"
59- "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t"
60-
6156 "prefetcht0 192(%5,%0,8) \n\t"
6257 "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1
6358 "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1
6459
65- "vfmadd231pd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
66- "vfmadd231pd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
67- "vfmadd231pd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
68- "vfmadd231pd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
60+ "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
61+ "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
62+ "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
63+ "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
6964
7065 "prefetcht0 192(%6,%0,8) \n\t"
7166 "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a2
@@ -90,6 +85,9 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
9085 "vfmadd231pd %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
9186 "vfmadd231pd %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
9287
88+ "prefetcht0 192(%3,%0,8) \n\t"
89+ "vmovups (%3,%0,8), %%ymm10 \n\t"
90+ "vmovups 32(%3,%0,8), %%ymm11 \n\t"
9391
9492#if ( !defined(CONJ ) && !defined (XCONJ ) ) || ( defined (CONJ ) && defined (XCONJ ) )
9593 "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
@@ -105,18 +103,8 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
105103 "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
106104#endif
107105
108- "prefetcht0 192(%3,%0,8) \n\t"
109- "vmovups (%3,%0,8), %%ymm12 \n\t"
110- "vmovups 32(%3,%0,8), %%ymm13 \n\t"
111-
112- #if !defined(XCONJ )
113- "vaddpd %%ymm8, %%ymm12, %%ymm12 \n\t"
114- "vaddpd %%ymm9, %%ymm13, %%ymm13 \n\t"
115- #else
116- "vaddsubpd %%ymm12, %%ymm8, %%ymm12 \n\t"
117- "vaddsubpd %%ymm13, %%ymm9, %%ymm13 \n\t"
118- #endif
119-
106+ "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
107+ "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
120108
121109 "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
122110 "vmovups %%ymm13, 32(%3,%0,8) \n\t"
0 commit comments