Skip to content

Commit 4568d32

Browse files
committed
added optimized cgemv_t kernel for haswell
1 parent c1a6374 commit 4568d32

3 files changed

Lines changed: 180 additions & 11 deletions

File tree

kernel/x86_64/KERNEL.HASWELL

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ DGEMVTKERNEL = dgemv_t.c
66
ZGEMVNKERNEL = zgemv_n.c
77
ZGEMVTKERNEL = zgemv_t.c
88

9+
CGEMVTKERNEL = cgemv_t.c
10+
911
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
1012
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
1113
SGEMMITCOPY = ../generic/gemm_tcopy_16.c

kernel/x86_64/cgemv_t.c

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -28,19 +28,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2828

2929
#include "common.h"
3030

31-
/*
32-
#if defined(BULLDOZER)
33-
#include "zgemv_t_microk_bulldozer-2.c"
34-
#elif defined(HASWELL)
35-
#include "zgemv_t_microk_haswell-2.c"
31+
#if defined(HASWELL)
32+
#include "cgemv_t_microk_haswell-2.c"
3633
#endif
37-
*/
3834

3935
#define NBMAX 2048
4036

4137
#ifndef HAVE_KERNEL_16x4
4238

43-
static void zgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
39+
static void cgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
4440
{
4541
BLASLONG i;
4642
FLOAT *a0,*a1,*a2,*a3;
@@ -92,7 +88,7 @@ static void zgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
9288

9389
#endif
9490

95-
static void zgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
91+
static void cgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
9692
{
9793
BLASLONG i;
9894
FLOAT *a0;
@@ -113,7 +109,7 @@ static void zgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
113109
*y = temp_r;
114110
*(y+1) = temp_i;
115111
}
116-
112+
117113
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
118114
{
119115
BLASLONG i;
@@ -176,7 +172,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
176172
ap[1] = a_ptr + lda;
177173
ap[2] = ap[1] + lda;
178174
ap[3] = ap[2] + lda;
179-
zgemv_kernel_16x4(NB,ap,xbuffer,ybuffer);
175+
cgemv_kernel_16x4(NB,ap,xbuffer,ybuffer);
180176
a_ptr += 4 * lda;
181177

182178
#if !defined(XCONJ)
@@ -210,7 +206,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
210206

211207
for( i = 0; i < n2 ; i++)
212208
{
213-
zgemv_kernel_16x1(NB,a_ptr,xbuffer,ybuffer);
209+
cgemv_kernel_16x1(NB,a_ptr,xbuffer,ybuffer);
214210
a_ptr += 1 * lda;
215211

216212
#if !defined(XCONJ)
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
/***************************************************************************
2+
Copyright (c) 2014, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#define HAVE_KERNEL_16x4 1
29+
static void cgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
30+
31+
/*
 * Haswell (AVX2/FMA) micro-kernel: four complex dot products in one pass.
 *
 * For each of the four column pointers ap[0..3] the kernel accumulates the
 * element-wise complex products of that column with x and stores the single
 * resulting complex value into y:
 *
 *     y[0],y[1] <- sum over ap[0]      y[2],y[3] <- sum over ap[1]
 *     y[4],y[5] <- sum over ap[2]      y[6],y[7] <- sum over ap[3]
 *
 * y is overwritten, not accumulated into.
 *
 * n   number of complex elements to process. Each loop iteration consumes
 *     8 complex elements and the loop only terminates when the counter
 *     reaches exactly zero, so n is presumably required to be a positive
 *     multiple of 8 -- confirm against the caller.
 * ap  array of four pointers to interleaved (real, imag) data.
 * x   interleaved (real, imag) vector.
 * y   output, 8 values (4 complex results).
 *
 * NOTE(review): addressing uses a scale factor of 4 bytes per element and
 * packed-single instructions, i.e. FLOAT is assumed to be a 4-byte float.
 *
 * The CONJ / XCONJ macros select how the real*x and imag*x partial sums are
 * combined after the loop (see the two branches below).
 */
static void cgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{

	/* Running offset into x and the columns, counted in FLOAT elements. */
	BLASLONG i = 0;

	__asm__  __volatile__
	(
	"vzeroupper			 \n\t"

	/* ymm8..ymm15: per column, one accumulator for a*xr and one for a*xi. */
	"vxorps		%%ymm8 , %%ymm8 , %%ymm8 	\n\t" // a0 * real(x)
	"vxorps		%%ymm9 , %%ymm9 , %%ymm9 	\n\t" // a0 * imag(x)
	"vxorps		%%ymm10, %%ymm10, %%ymm10	\n\t" // a1 * real(x)
	"vxorps		%%ymm11, %%ymm11, %%ymm11	\n\t" // a1 * imag(x)
	"vxorps		%%ymm12, %%ymm12, %%ymm12	\n\t" // a2 * real(x)
	"vxorps		%%ymm13, %%ymm13, %%ymm13	\n\t" // a2 * imag(x)
	"vxorps		%%ymm14, %%ymm14, %%ymm14	\n\t" // a3 * real(x)
	"vxorps		%%ymm15, %%ymm15, %%ymm15	\n\t" // a3 * imag(x)

	".align 16				        \n\t"
	".L01LOOP%=:				        \n\t"

	/* ---- first half of the iteration: complex elements 0..3 ---- */
	"prefetcht0	 192(%4,%0,4)		       \n\t"
	"vmovups	(%4,%0,4), %%ymm4	       \n\t" // 4 complex values from a0
	"prefetcht0	 192(%5,%0,4)		       \n\t"
	"vmovups	(%5,%0,4), %%ymm5	       \n\t" // 4 complex values from a1

	"prefetcht0	 192(%2,%0,4)		       \n\t"
	"vmovups	(%2,%0,4)  , %%ymm6	       \n\t" // 4 complex values from x
	"vpermilps	$0xb1, %%ymm6, %%ymm7 	       \n\t" // exchange real and imag parts
	"vblendps	$0x55, %%ymm6, %%ymm7, %%ymm0  \n\t" // only the real parts (each duplicated)
	"vblendps	$0x55, %%ymm7, %%ymm6, %%ymm1  \n\t" // only the imag parts (each duplicated)

	"prefetcht0	 192(%6,%0,4)		       \n\t"
	"vmovups	(%6,%0,4), %%ymm6	       \n\t" // 4 complex values from a2
	"prefetcht0	 192(%7,%0,4)		       \n\t"
	"vmovups	(%7,%0,4), %%ymm7	       \n\t" // 4 complex values from a3

	"vfmadd231ps      %%ymm4 , %%ymm0, %%ymm8      \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
	"vfmadd231ps      %%ymm4 , %%ymm1, %%ymm9      \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
	"vfmadd231ps      %%ymm5 , %%ymm0, %%ymm10     \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
	"vfmadd231ps      %%ymm5 , %%ymm1, %%ymm11     \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
	"vfmadd231ps      %%ymm6 , %%ymm0, %%ymm12     \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
	"vfmadd231ps      %%ymm6 , %%ymm1, %%ymm13     \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
	"vfmadd231ps      %%ymm7 , %%ymm0, %%ymm14     \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
	"vfmadd231ps      %%ymm7 , %%ymm1, %%ymm15     \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1

	/* ---- second half of the iteration: complex elements 4..7 ---- */
	"vmovups	32(%4,%0,4), %%ymm4	       \n\t" // 4 complex values from a0
	"vmovups	32(%5,%0,4), %%ymm5	       \n\t" // 4 complex values from a1

	"vmovups	32(%2,%0,4)  , %%ymm6	       \n\t" // 4 complex values from x
	"vpermilps	$0xb1, %%ymm6, %%ymm7 	       \n\t" // exchange real and imag parts
	"vblendps	$0x55, %%ymm6, %%ymm7, %%ymm0  \n\t" // only the real parts (each duplicated)
	"vblendps	$0x55, %%ymm7, %%ymm6, %%ymm1  \n\t" // only the imag parts (each duplicated)

	"vmovups	32(%6,%0,4), %%ymm6	       \n\t" // 4 complex values from a2
	"vmovups	32(%7,%0,4), %%ymm7	       \n\t" // 4 complex values from a3

	"vfmadd231ps      %%ymm4 , %%ymm0, %%ymm8      \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
	"vfmadd231ps      %%ymm4 , %%ymm1, %%ymm9      \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
	"vfmadd231ps      %%ymm5 , %%ymm0, %%ymm10     \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
	"vfmadd231ps      %%ymm5 , %%ymm1, %%ymm11     \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
	"vfmadd231ps      %%ymm6 , %%ymm0, %%ymm12     \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
	"vfmadd231ps      %%ymm6 , %%ymm1, %%ymm13     \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
	"vfmadd231ps      %%ymm7 , %%ymm0, %%ymm14     \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
	"vfmadd231ps      %%ymm7 , %%ymm1, %%ymm15     \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1

	"addq		$16 , %0	  	 	      \n\t" // advance 16 FLOATs = 8 complex values
	"subq	        $8  , %1		      \n\t" // 8 complex values consumed
	"jnz		.L01LOOP%=		      \n\t"

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
	/* Per complex lane: ( ar*xr - ai*xi , ai*xr + ar*xi ) */
	"vpermilps	$0xb1 , %%ymm9 , %%ymm9               \n\t"
	"vpermilps	$0xb1 , %%ymm11, %%ymm11              \n\t"
	"vpermilps	$0xb1 , %%ymm13, %%ymm13              \n\t"
	"vpermilps	$0xb1 , %%ymm15, %%ymm15              \n\t"
	"vaddsubps	%%ymm9 , %%ymm8, %%ymm8               \n\t"
	"vaddsubps	%%ymm11, %%ymm10, %%ymm10             \n\t"
	"vaddsubps	%%ymm13, %%ymm12, %%ymm12             \n\t"
	"vaddsubps	%%ymm15, %%ymm14, %%ymm14             \n\t"
#else
	/* Per complex lane: ( ar*xr + ai*xi , ar*xi - ai*xr ) */
	"vpermilps	$0xb1 , %%ymm8 , %%ymm8               \n\t"
	"vpermilps	$0xb1 , %%ymm10, %%ymm10              \n\t"
	"vpermilps	$0xb1 , %%ymm12, %%ymm12              \n\t"
	"vpermilps	$0xb1 , %%ymm14, %%ymm14              \n\t"
	"vaddsubps	%%ymm8 , %%ymm9 , %%ymm8              \n\t"
	"vaddsubps	%%ymm10, %%ymm11, %%ymm10             \n\t"
	"vaddsubps	%%ymm12, %%ymm13, %%ymm12             \n\t"
	"vaddsubps	%%ymm14, %%ymm15, %%ymm14             \n\t"
	"vpermilps	$0xb1 , %%ymm8 , %%ymm8               \n\t"
	"vpermilps	$0xb1 , %%ymm10, %%ymm10              \n\t"
	"vpermilps	$0xb1 , %%ymm12, %%ymm12              \n\t"
	"vpermilps	$0xb1 , %%ymm14, %%ymm14              \n\t"
#endif

	/* Horizontal reduction: fold the upper 128 bits onto the lower ... */
	"vextractf128	$1, %%ymm8 , %%xmm9	      \n\t"
	"vextractf128	$1, %%ymm10, %%xmm11	      \n\t"
	"vextractf128	$1, %%ymm12, %%xmm13	      \n\t"
	"vextractf128	$1, %%ymm14, %%xmm15	      \n\t"

	"vaddps		%%xmm8 , %%xmm9 , %%xmm8       \n\t"
	"vaddps		%%xmm10, %%xmm11, %%xmm10      \n\t"
	"vaddps		%%xmm12, %%xmm13, %%xmm12      \n\t"
	"vaddps		%%xmm14, %%xmm15, %%xmm14      \n\t"

	/* ... then fold the upper complex value (64 bits) onto the lower one. */
	"vshufpd        $0x1, %%xmm8 , %%xmm8 , %%xmm9   \n\t"
	"vshufpd        $0x1, %%xmm10, %%xmm10, %%xmm11  \n\t"
	"vshufpd        $0x1, %%xmm12, %%xmm12, %%xmm13  \n\t"
	"vshufpd        $0x1, %%xmm14, %%xmm14, %%xmm15  \n\t"

	"vaddps		%%xmm8 , %%xmm9 , %%xmm8       \n\t"
	"vaddps		%%xmm10, %%xmm11, %%xmm10      \n\t"
	"vaddps		%%xmm12, %%xmm13, %%xmm12      \n\t"
	"vaddps		%%xmm14, %%xmm15, %%xmm14      \n\t"

	/* Each vmovsd writes 8 bytes: one (real, imag) pair per column. */
	"vmovsd	%%xmm8 ,   (%3)		\n\t"
	"vmovsd	%%xmm10,  8(%3)		\n\t"
	"vmovsd	%%xmm12, 16(%3)		\n\t"
	"vmovsd	%%xmm14, 24(%3)		\n\t"

	"vzeroupper			 \n\t"

	:
	  /*
	   * i and n are modified by addq/subq above, so they must be declared
	   * as read-write operands; as plain inputs the compiler is entitled
	   * to assume their registers are unchanged after the asm statement.
	   */
	  "+r" (i),	// 0
	  "+r" (n)	// 1
	:
	  "r" (x),      // 2
	  "r" (y),      // 3
	  "r" (ap[0]),  // 4
	  "r" (ap[1]),  // 5
	  "r" (ap[2]),  // 6
	  "r" (ap[3])   // 7
	: "cc",
	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
170+
171+

0 commit comments

Comments
 (0)