Skip to content

Commit 793f2d4

Browse files
committed
added optimized sgemv_n kernel for nehalem
1 parent a4dde45 commit 793f2d4

3 files changed

Lines changed: 153 additions & 0 deletions

File tree

kernel/x86_64/KERNEL.NEHALEM

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
ifdef OS_WINDOWS
2+
SGEMVNKERNEL = ../arm/gemv_n.c
3+
SGEMVTKERNEL = ../arm/gemv_t.c
4+
else
5+
SGEMVNKERNEL = sgemv_n.c
6+
endif
7+
18

29
SGEMMKERNEL = gemm_kernel_4x8_nehalem.S
310
SGEMMINCOPY = gemm_ncopy_4.S

kernel/x86_64/sgemv_n.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3535
#include "sgemv_n_microk_haswell-2.c"
3636
#elif defined(SANDYBRIDGE)
3737
#include "sgemv_n_microk_sandy-2.c"
38+
#elif defined(NEHALEM)
39+
#include "sgemv_n_microk_nehalem-2.c"
3840
#endif
3941

4042

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
/***************************************************************************
2+
Copyright (c) 2014, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#define HAVE_KERNEL_16x4 1
29+
static void sgemv_kernel_16x4( long n, float **ap, float *x, float *y) __attribute__ ((noinline));

/*
 * SSE micro-kernel: y[0..n) += ap[0][0..n)*x[0] + ap[1][0..n)*x[1]
 *                            + ap[2][0..n)*x[2] + ap[3][0..n)*x[3]
 *
 * n   number of elements of y (and of every ap[k]) to process. The loop
 *     handles 16 floats per iteration and only terminates when the
 *     counter reaches exactly zero, so n MUST be a positive multiple
 *     of 16.
 * ap  array of four pointers, one per column; each must reference at
 *     least n floats.
 * x   four multipliers, one per column.
 * y   accumulator vector of at least n floats, updated in place.
 *
 * All loads and stores use movups, so no alignment is required.
 */
static void sgemv_kernel_16x4( long n, float **ap, float *x, float *y)
{
	long i = 0;	/* element index into y and the four columns */

	__asm__ __volatile__
	(
	/* Broadcast x[0..3] into all four lanes of xmm12..xmm15. */
	"movss	  (%2), %%xmm12		\n\t"	// x0
	"movss	 4(%2), %%xmm13		\n\t"	// x1
	"movss	 8(%2), %%xmm14		\n\t"	// x2
	"movss	12(%2), %%xmm15		\n\t"	// x3
	"shufps	$0, %%xmm12, %%xmm12	\n\t"
	"shufps	$0, %%xmm13, %%xmm13	\n\t"
	"shufps	$0, %%xmm14, %%xmm14	\n\t"
	"shufps	$0, %%xmm15, %%xmm15	\n\t"

	".align 16			\n\t"
	".L01LOOP%=:			\n\t"
	/* Load 16 floats of y into the accumulators xmm4..xmm7. */
	"movups	  (%3,%0,4), %%xmm4	\n\t"	// 4 * y
	"movups	16(%3,%0,4), %%xmm5	\n\t"	// 4 * y
	"movups	32(%3,%0,4), %%xmm6	\n\t"	// 4 * y
	"movups	48(%3,%0,4), %%xmm7	\n\t"	// 4 * y

	/* Column 0: accumulators += ap[0][i..i+15] * x0 */
	"prefetcht0	192(%4,%0,4)	\n\t"

	"movups	  (%4,%0,4), %%xmm8	\n\t"
	"movups	16(%4,%0,4), %%xmm9	\n\t"
	"movups	32(%4,%0,4), %%xmm10	\n\t"
	"movups	48(%4,%0,4), %%xmm11	\n\t"
	"mulps	%%xmm12, %%xmm8		\n\t"
	"addps	%%xmm8 , %%xmm4		\n\t"
	"mulps	%%xmm12, %%xmm9		\n\t"
	"addps	%%xmm9 , %%xmm5		\n\t"
	"mulps	%%xmm12, %%xmm10	\n\t"
	"addps	%%xmm10, %%xmm6		\n\t"
	"mulps	%%xmm12, %%xmm11	\n\t"
	"addps	%%xmm11, %%xmm7		\n\t"

	/* Column 1: accumulators += ap[1][i..i+15] * x1 */
	"prefetcht0	192(%5,%0,4)	\n\t"

	"movups	  (%5,%0,4), %%xmm8	\n\t"
	"movups	16(%5,%0,4), %%xmm9	\n\t"
	"movups	32(%5,%0,4), %%xmm10	\n\t"
	"movups	48(%5,%0,4), %%xmm11	\n\t"
	"mulps	%%xmm13, %%xmm8		\n\t"
	"addps	%%xmm8 , %%xmm4		\n\t"
	"mulps	%%xmm13, %%xmm9		\n\t"
	"addps	%%xmm9 , %%xmm5		\n\t"
	"mulps	%%xmm13, %%xmm10	\n\t"
	"addps	%%xmm10, %%xmm6		\n\t"
	"mulps	%%xmm13, %%xmm11	\n\t"
	"addps	%%xmm11, %%xmm7		\n\t"

	/* Column 2: accumulators += ap[2][i..i+15] * x2 */
	"prefetcht0	192(%6,%0,4)	\n\t"

	"movups	  (%6,%0,4), %%xmm8	\n\t"
	"movups	16(%6,%0,4), %%xmm9	\n\t"
	"movups	32(%6,%0,4), %%xmm10	\n\t"
	"movups	48(%6,%0,4), %%xmm11	\n\t"
	"mulps	%%xmm14, %%xmm8		\n\t"
	"addps	%%xmm8 , %%xmm4		\n\t"
	"mulps	%%xmm14, %%xmm9		\n\t"
	"addps	%%xmm9 , %%xmm5		\n\t"
	"mulps	%%xmm14, %%xmm10	\n\t"
	"addps	%%xmm10, %%xmm6		\n\t"
	"mulps	%%xmm14, %%xmm11	\n\t"
	"addps	%%xmm11, %%xmm7		\n\t"

	/* Column 3: accumulators += ap[3][i..i+15] * x3 */
	"prefetcht0	192(%7,%0,4)	\n\t"

	"movups	  (%7,%0,4), %%xmm8	\n\t"
	"movups	16(%7,%0,4), %%xmm9	\n\t"
	"movups	32(%7,%0,4), %%xmm10	\n\t"
	"movups	48(%7,%0,4), %%xmm11	\n\t"
	"mulps	%%xmm15, %%xmm8		\n\t"
	"addps	%%xmm8 , %%xmm4		\n\t"
	"mulps	%%xmm15, %%xmm9		\n\t"
	"addps	%%xmm9 , %%xmm5		\n\t"
	"mulps	%%xmm15, %%xmm10	\n\t"
	"addps	%%xmm10, %%xmm6		\n\t"
	"mulps	%%xmm15, %%xmm11	\n\t"
	"addps	%%xmm11, %%xmm7		\n\t"

	/* Write the 16 updated y values back. */
	"movups	%%xmm4,   (%3,%0,4)	\n\t"	// 4 * y
	"movups	%%xmm5, 16(%3,%0,4)	\n\t"	// 4 * y
	"movups	%%xmm6, 32(%3,%0,4)	\n\t"	// 4 * y
	"movups	%%xmm7, 48(%3,%0,4)	\n\t"	// 4 * y

	/* Advance by 16 elements; leave the loop when the count hits zero. */
	"addq	$16, %0			\n\t"
	"subq	$16, %1			\n\t"
	"jnz	.L01LOOP%=		\n\t"

	:
	  /*
	   * i and n are modified by the asm (addq/subq above), so they must
	   * be read-write operands; declaring them as plain inputs lets the
	   * compiler assume their registers are unchanged. The early-clobber
	   * marker keeps them from sharing a register with any input, since
	   * they are written while the inputs are still in use.
	   */
	  "+&r" (i),	// 0
	  "+&r" (n)	// 1
	:
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (ap[0]),	// 4
	  "r" (ap[1]),	// 5
	  "r" (ap[2]),	// 6
	  "r" (ap[3])	// 7
	: "cc",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);

}
143+
144+

0 commit comments

Comments
 (0)