Skip to content

Commit 11e34dd

Browse files
committed
bugfix for zgemv_n_microk_haswell-2.c
1 parent 9528f0d commit 11e34dd

3 files changed

Lines changed: 20 additions & 25 deletions

File tree

kernel/x86_64/KERNEL.HASWELL

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,8 @@ SGEMVTKERNEL = sgemv_t.c
33
DGEMVNKERNEL = dgemv_n.c
44
DGEMVTKERNEL = dgemv_t.c
55

6-
ifndef OS_WINDOWS
76
ZGEMVNKERNEL = zgemv_n.c
8-
endif
9-
ZGEMVTKERNEL = zgemv_t.c
7+
#ZGEMVTKERNEL = zgemv_t.c
108

119
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
1210
SGEMMINCOPY = ../generic/gemm_ncopy_16.c

kernel/x86_64/zgemv_n.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
2525
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2626
*****************************************************************************/
2727

28-
28+
#include <stdlib.h>
29+
#include <stdio.h>
2930
#include "common.h"
3031

3132
#if defined(HASWELL)
@@ -141,6 +142,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
141142
BLASLONG n2;
142143
FLOAT xbuffer[8],*ybuffer;
143144

145+
146+
#if 0
147+
printf("%s %d %d %.16f %.16f %d %d %d\n","zgemv_n",m,n,alpha_r,alpha_i,lda,inc_x,inc_y);
148+
#endif
149+
150+
if ( m < 1 ) return(0);
151+
if ( n < 1 ) return(0);
152+
144153
ybuffer = buffer;
145154

146155
inc_x *= 2;

kernel/x86_64/zgemv_n_microk_haswell-2.c

Lines changed: 9 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -53,19 +53,14 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
5353
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values from a0
5454
"vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values from a0
5555

56-
"vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t"
57-
"vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t"
58-
"vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t"
59-
"vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t"
60-
6156
"prefetcht0 192(%5,%0,8) \n\t"
6257
"vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values from a1
6358
"vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values from a1
6459

65-
"vfmadd231pd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
66-
"vfmadd231pd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
67-
"vfmadd231pd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
68-
"vfmadd231pd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
60+
"vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
61+
"vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
62+
"vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
63+
"vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
6964

7065
"prefetcht0 192(%6,%0,8) \n\t"
7166
"vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values from a2
@@ -90,6 +85,9 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
9085
"vfmadd231pd %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
9186
"vfmadd231pd %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
9287

88+
"prefetcht0 192(%3,%0,8) \n\t"
89+
"vmovups (%3,%0,8), %%ymm10 \n\t"
90+
"vmovups 32(%3,%0,8), %%ymm11 \n\t"
9391

9492
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
9593
"vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
@@ -105,18 +103,8 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
105103
"vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
106104
#endif
107105

108-
"prefetcht0 192(%3,%0,8) \n\t"
109-
"vmovups (%3,%0,8), %%ymm12 \n\t"
110-
"vmovups 32(%3,%0,8), %%ymm13 \n\t"
111-
112-
#if !defined(XCONJ)
113-
"vaddpd %%ymm8, %%ymm12, %%ymm12 \n\t"
114-
"vaddpd %%ymm9, %%ymm13, %%ymm13 \n\t"
115-
#else
116-
"vaddsubpd %%ymm12, %%ymm8, %%ymm12 \n\t"
117-
"vaddsubpd %%ymm13, %%ymm9, %%ymm13 \n\t"
118-
#endif
119-
106+
"vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
107+
"vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
120108

121109
"vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
122110
"vmovups %%ymm13, 32(%3,%0,8) \n\t"

0 commit comments

Comments
 (0)