Skip to content

Commit c1a6374

Browse files
committed
optimized zgemv_n kernel for sandybridge
1 parent dc05937 commit c1a6374

3 files changed

Lines changed: 14 additions & 5 deletions

File tree

kernel/x86_64/KERNEL.SANDYBRIDGE

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
SGEMVNKERNEL = sgemv_n.c
22
SGEMVTKERNEL = sgemv_t.c
33

4+
ZGEMVNKERNEL = zgemv_n.c
5+
6+
47
SGEMMKERNEL = sgemm_kernel_16x4_sandy.S
58
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
69
SGEMMITCOPY = ../generic/gemm_tcopy_16.c

kernel/x86_64/zgemv_n.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3131

3232
#if defined(HASWELL)
3333
#include "zgemv_n_microk_haswell-2.c"
34+
#elif defined(SANDYBRIDGE)
35+
#include "zgemv_n_microk_sandy-2.c"
3436
#endif
3537

3638

39+
3740
#define NBMAX 1024
3841

3942
#ifndef HAVE_KERNEL_16x4

kernel/x86_64/zgemv_n_microk_sandy-2.c

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -50,39 +50,42 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
5050
".align 16 \n\t"
5151
".L01LOOP%=: \n\t"
5252

53+
"prefetcht0 256(%4,%0,8) \n\t"
5354
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values from a0
5455
"vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values from a0
5556

5657
"vmulpd %%ymm8 , %%ymm0 , %%ymm12 \n\t"
5758
"vmulpd %%ymm8 , %%ymm1 , %%ymm13 \n\t"
59+
"prefetcht0 256(%5,%0,8) \n\t"
5860
"vmulpd %%ymm9 , %%ymm0 , %%ymm14 \n\t"
59-
"vmulpd %%ymm9 , %%ymm1 , %%ymm15 \n\t"
60-
6161
"vmovups (%5,%0,8), %%ymm8 \n\t" // 2 complex values from a0
62+
"vmulpd %%ymm9 , %%ymm1 , %%ymm15 \n\t"
6263
"vmovups 32(%5,%0,8), %%ymm9 \n\t" // 2 complex values from a0
6364

6465
"vmulpd %%ymm8 , %%ymm2 , %%ymm10 \n\t"
6566
"vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t"
6667
"vmulpd %%ymm8 , %%ymm3 , %%ymm11 \n\t"
6768
"vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t"
69+
"prefetcht0 256(%6,%0,8) \n\t"
6870
"vmulpd %%ymm9 , %%ymm2 , %%ymm10 \n\t"
6971
"vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t"
72+
"vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values from a0
7073
"vmulpd %%ymm9 , %%ymm3 , %%ymm11 \n\t"
7174
"vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t"
7275

73-
"vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a0
7476
"vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values from a0
7577

7678
"vmulpd %%ymm8 , %%ymm4 , %%ymm10 \n\t"
7779
"vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t"
7880
"vmulpd %%ymm8 , %%ymm5 , %%ymm11 \n\t"
7981
"vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t"
82+
"prefetcht0 256(%7,%0,8) \n\t"
8083
"vmulpd %%ymm9 , %%ymm4 , %%ymm10 \n\t"
8184
"vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t"
85+
"vmovups (%7,%0,8), %%ymm8 \n\t" // 2 complex values from a0
8286
"vmulpd %%ymm9 , %%ymm5 , %%ymm11 \n\t"
8387
"vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t"
8488

85-
"vmovups (%7,%0,8), %%ymm8 \n\t" // 2 complex values form a0
8689
"vmovups 32(%7,%0,8), %%ymm9 \n\t" // 2 complex values from a0
8790

8891
"vmulpd %%ymm8 , %%ymm6 , %%ymm10 \n\t"
@@ -94,7 +97,7 @@ static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
9497
"vmulpd %%ymm9 , %%ymm7 , %%ymm11 \n\t"
9598
"vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t"
9699

97-
"prefetcht0 192(%3,%0,8) \n\t"
100+
"prefetcht0 256(%3,%0,8) \n\t"
98101
"vmovups (%3,%0,8), %%ymm10 \n\t"
99102
"vmovups 32(%3,%0,8), %%ymm11 \n\t"
100103

0 commit comments

Comments
 (0)