@@ -50,22 +50,13 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
5050 ".align 16 \n\t"
5151 ".L01LOOP%=: \n\t"
5252
53- "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t"
54- "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t"
55- "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t"
56- "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t"
57-
5853 "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values from a0
5954 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values from a0
6055
61- "vmulpd %%ymm8 , %%ymm0 , %%ymm10 \n\t"
62- "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t"
63- "vmulpd %%ymm8 , %%ymm1 , %%ymm11 \n\t"
64- "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t"
65- "vmulpd %%ymm9 , %%ymm0 , %%ymm10 \n\t"
66- "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t"
67- "vmulpd %%ymm9 , %%ymm1 , %%ymm11 \n\t"
68- "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t"
56+ "vmulpd %%ymm8 , %%ymm0 , %%ymm12 \n\t"
57+ "vmulpd %%ymm8 , %%ymm1 , %%ymm13 \n\t"
58+ "vmulpd %%ymm9 , %%ymm0 , %%ymm14 \n\t"
59+ "vmulpd %%ymm9 , %%ymm1 , %%ymm15 \n\t"
6960
7061 "vmovups (%5,%0,8), %%ymm8 \n\t" // 2 complex values from a0
7162 "vmovups 32(%5,%0,8), %%ymm9 \n\t" // 2 complex values from a0
@@ -103,6 +94,10 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
10394 "vmulpd %%ymm9 , %%ymm7 , %%ymm11 \n\t"
10495 "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t"
10596
97+ "prefetcht0 192(%3,%0,8) \n\t"
98+ "vmovups (%3,%0,8), %%ymm10 \n\t"
99+ "vmovups 32(%3,%0,8), %%ymm11 \n\t"
100+
106101#if ( !defined(CONJ ) && !defined (XCONJ ) ) || ( defined (CONJ ) && defined (XCONJ ) )
107102 "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
108103 "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
@@ -117,18 +112,8 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
117112 "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
118113#endif
119114
120- "prefetcht0 192(%3,%0,8) \n\t"
121- "vmovups (%3,%0,8), %%ymm12 \n\t"
122- "vmovups 32(%3,%0,8), %%ymm13 \n\t"
123-
124- #if !defined(XCONJ )
125- "vaddpd %%ymm8, %%ymm12, %%ymm12 \n\t"
126- "vaddpd %%ymm9, %%ymm13, %%ymm13 \n\t"
127- #else
128- "vaddsubpd %%ymm12, %%ymm8, %%ymm12 \n\t"
129- "vaddsubpd %%ymm13, %%ymm9, %%ymm13 \n\t"
130- #endif
131-
115+ "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
116+ "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
132117
133118 "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
134119 "vmovups %%ymm13, 32(%3,%0,8) \n\t"
0 commit comments