Skip to content

Commit 9528f0d

Browse files
committed
bugfix in zgemv_n_microk_sandy-2.c
1 parent b065505 commit 9528f0d

1 file changed

Lines changed: 10 additions & 25 deletions

File tree

kernel/x86_64/zgemv_n_microk_sandy-2.c

Lines changed: 10 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -50,22 +50,13 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
5050
".align 16 \n\t"
5151
".L01LOOP%=: \n\t"
5252

53-
"vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t"
54-
"vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t"
55-
"vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t"
56-
"vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t"
57-
5853
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values from a0
5954
"vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values from a0
6055

61-
"vmulpd %%ymm8 , %%ymm0 , %%ymm10 \n\t"
62-
"vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t"
63-
"vmulpd %%ymm8 , %%ymm1 , %%ymm11 \n\t"
64-
"vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t"
65-
"vmulpd %%ymm9 , %%ymm0 , %%ymm10 \n\t"
66-
"vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t"
67-
"vmulpd %%ymm9 , %%ymm1 , %%ymm11 \n\t"
68-
"vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t"
56+
"vmulpd %%ymm8 , %%ymm0 , %%ymm12 \n\t"
57+
"vmulpd %%ymm8 , %%ymm1 , %%ymm13 \n\t"
58+
"vmulpd %%ymm9 , %%ymm0 , %%ymm14 \n\t"
59+
"vmulpd %%ymm9 , %%ymm1 , %%ymm15 \n\t"
6960

7061
"vmovups (%5,%0,8), %%ymm8 \n\t" // 2 complex values from a0
7162
"vmovups 32(%5,%0,8), %%ymm9 \n\t" // 2 complex values from a0
@@ -103,6 +94,10 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
10394
"vmulpd %%ymm9 , %%ymm7 , %%ymm11 \n\t"
10495
"vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t"
10596

97+
"prefetcht0 192(%3,%0,8) \n\t"
98+
"vmovups (%3,%0,8), %%ymm10 \n\t"
99+
"vmovups 32(%3,%0,8), %%ymm11 \n\t"
100+
106101
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
107102
"vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
108103
"vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
@@ -117,18 +112,8 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
117112
"vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
118113
#endif
119114

120-
"prefetcht0 192(%3,%0,8) \n\t"
121-
"vmovups (%3,%0,8), %%ymm12 \n\t"
122-
"vmovups 32(%3,%0,8), %%ymm13 \n\t"
123-
124-
#if !defined(XCONJ)
125-
"vaddpd %%ymm8, %%ymm12, %%ymm12 \n\t"
126-
"vaddpd %%ymm9, %%ymm13, %%ymm13 \n\t"
127-
#else
128-
"vaddsubpd %%ymm12, %%ymm8, %%ymm12 \n\t"
129-
"vaddsubpd %%ymm13, %%ymm9, %%ymm13 \n\t"
130-
#endif
131-
115+
"vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
116+
"vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
132117

133118
"vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
134119
"vmovups %%ymm13, 32(%3,%0,8) \n\t"

0 commit comments

Comments
 (0)