@@ -52,7 +52,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
5252 "subq $4 , %1 \n\t"
5353 "jz 2f \n\t"
5454
55- ".align 16 \n\t"
55+ // ".align 16 \n\t"
5656 "1: \n\t"
5757
5858 "vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t"
@@ -114,3 +114,78 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
114114}
115115
116116
117+ #define HAVE_KERNEL_4x2
118+
119+ static void dgemv_kernel_4x2 ( BLASLONG n , FLOAT * * ap , FLOAT * x , FLOAT * y , FLOAT * alpha ) __attribute__ ((noinline ));
120+
121+ static void dgemv_kernel_4x2 ( BLASLONG n , FLOAT * * ap , FLOAT * x , FLOAT * y , FLOAT * alpha )
122+ {
123+
124+ BLASLONG register i = 0 ;
125+
126+ __asm__ __volatile__
127+ (
128+ "vbroadcastsd (%2), %%ymm12 \n\t" // x0
129+ "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
130+
131+ "vmovups (%4,%0,8), %%ymm0 \n\t"
132+ "vmovups (%5,%0,8), %%ymm1 \n\t"
133+
134+ "vbroadcastsd (%6), %%ymm6 \n\t" // alpha
135+
136+ "addq $4 , %0 \n\t"
137+ "subq $4 , %1 \n\t"
138+ "jz 2f \n\t"
139+
140+ "1: \n\t"
141+
142+ "vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t"
143+ "vmulpd %%ymm1 , %%ymm13, %%ymm5 \n\t"
144+ "vmovups (%4,%0,8), %%ymm0 \n\t"
145+ "vmovups (%5,%0,8), %%ymm1 \n\t"
146+
147+ "vmovups -32(%3,%0,8), %%ymm8 \n\t" // 4 * y
148+ "vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t"
149+ "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
150+
151+ "vmovups %%ymm8, -32(%3,%0,8) \n\t" // 4 * y
152+
153+ "addq $4 , %0 \n\t"
154+ "subq $4 , %1 \n\t"
155+ "jnz 1b \n\t"
156+
157+
158+ "2: \n\t"
159+
160+ "vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t"
161+ "vmulpd %%ymm1 , %%ymm13, %%ymm5 \n\t"
162+
163+
164+ "vmovups -32(%3,%0,8), %%ymm8 \n\t" // 4 * y
165+ "vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t"
166+ "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
167+
168+ "vmovups %%ymm8, -32(%3,%0,8) \n\t" // 4 * y
169+
170+
171+ "vzeroupper \n\t"
172+
173+
174+ :
175+ :
176+ "r" (i ), // 0
177+ "r" (n ), // 1
178+ "r" (x ), // 2
179+ "r" (y ), // 3
180+ "r" (ap [0 ]), // 4
181+ "r" (ap [1 ]), // 5
182+ "r" (alpha ) // 6
183+ : "cc" ,
184+ "%xmm0" , "%xmm1" ,
185+ "%xmm4" , "%xmm5" ,
186+ "%xmm6" ,
187+ "%xmm8" ,
188+ "%xmm12" , "%xmm13" ,
189+ "memory"
190+ );
191+ }
0 commit comments