@@ -27,15 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2727
2828
2929#include "common.h"
30- #define Z13_D 1
30+
3131#define PREFETCH_INS 1
3232#if defined(Z13_A )
3333#include <vecintrin.h>
3434
35- static void daxpy_kernel_32 (BLASLONG n , FLOAT * x , FLOAT * y , FLOAT * alpha )
35+ static void daxpy_kernel_32 (BLASLONG n , FLOAT * x , FLOAT * y , FLOAT alpha )
3636{
3737 BLASLONG i = 0 ;
38- __vector double v_a = {* alpha ,* alpha };
38+ __vector double v_a = {alpha ,alpha };
3939 __vector double * v_y = (__vector double * )y ;
4040 __vector double * v_x = (__vector double * )x ;
4141
@@ -60,256 +60,53 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
6060 }
6161
6262}
63- #elif defined(Z13_B )
64- static void __attribute__ ((noinline )) daxpy_kernel_32 (BLASLONG n , FLOAT * x , FLOAT * y , FLOAT * alpha )
65- {
66-
67-
68- __asm__ volatile (
69- #if defined(PREFETCH_INS )
70- "pfd 1, 0(%1) \n\t"
71- "pfd 2, 0(%2) \n\t"
72- #endif
73- "vlrepg %%v0 , 0(%3) \n\t"
74- "srlg %3,%0,5 \n\t"
75- "xgr %%r1,%%r1 \n\t"
76- "vlr %%v1,%%v0 \n\t"
77- ".align 16 \n\t"
78- "1: \n\t"
79- #if defined(PREFETCH_INS )
80- "pfd 1, 256(%%r1,%1) \n\t"
81- "pfd 2, 256(%%r1,%2) \n\t"
82- #endif
83-
84- "vl %%v24, 0(%%r1,%2) \n\t"
85- "vl %%v16, 0(%%r1,%1) \n\t"
86- "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
87- "vst %%v16, 0(%%r1,%2) \n\t"
88- "vl %%v25, 16(%%r1,%2) \n\t"
89- "vl %%v17, 16(%%r1,%1) \n\t"
90- "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
91- "vst %%v17, 16(%%r1,%2) \n\t"
92- "vl %%v26, 32(%%r1,%2) \n\t"
93- "vl %%v18, 32(%%r1,%1) \n\t"
94- "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
95- "vst %%v18, 32(%%r1,%2) \n\t"
96- "vl %%v27, 48(%%r1,%2) \n\t"
97- "vl %%v19, 48(%%r1,%1) \n\t"
98- "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
99- "vst %%v19, 48(%%r1,%2) \n\t"
100-
101- "vl %%v24,( 0+64)(%%r1,%2) \n\t"
102- "vl %%v16,( 0+64)(%%r1,%1) \n\t"
103- "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
104- "vst %%v16,( 0+64)(%%r1,%2) \n\t"
105- "vl %%v25, (16+64)(%%r1,%2) \n\t"
106- "vl %%v17, (16+64)(%%r1,%1) \n\t"
107- "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
108- "vst %%v17, (16+64)(%%r1,%2) \n\t"
109- "vl %%v26, (32+64)(%%r1,%2) \n\t"
110- "vl %%v18, (32+64)(%%r1,%1) \n\t"
111- "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
112- "vst %%v18, (32+64)(%%r1,%2) \n\t"
113- "vl %%v27, (48+64)(%%r1,%2) \n\t"
114- "vl %%v19, (48+64)(%%r1,%1) \n\t"
115- "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
116- "vst %%v19, (48+64)(%%r1,%2) \n\t"
117-
118- "vl %%v24,( 0+128)(%%r1,%2) \n\t"
119- "vl %%v16,( 0+128)(%%r1,%1) \n\t"
120- "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
121- "vst %%v16,( 0+128)(%%r1,%2) \n\t"
122- "vl %%v25, (16+128)(%%r1,%2) \n\t"
123- "vl %%v17, (16+128)(%%r1,%1) \n\t"
124- "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
125- "vst %%v17, (16+128)(%%r1,%2) \n\t"
126- "vl %%v26, (32+128)(%%r1,%2) \n\t"
127- "vl %%v18, (32+128)(%%r1,%1) \n\t"
128- "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
129- "vst %%v18, (32+128)(%%r1,%2) \n\t"
130- "vl %%v27, (48+128)(%%r1,%2) \n\t"
131- "vl %%v19, (48+128)(%%r1,%1) \n\t"
132- "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
133- "vst %%v19, (48+128)(%%r1,%2) \n\t"
134-
135- "vl %%v24,( 0+192)(%%r1,%2) \n\t"
136- "vl %%v16,( 0+192)(%%r1,%1) \n\t"
137- "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
138- "vst %%v16,( 0+192)(%%r1,%2) \n\t"
139- "vl %%v25, (16+192)(%%r1,%2) \n\t"
140- "vl %%v17, (16+192)(%%r1,%1) \n\t"
141- "vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
142- "vst %%v17, (16+192)(%%r1,%2) \n\t"
143- "vl %%v26, (32+192)(%%r1,%2) \n\t"
144- "vl %%v18, (32+192)(%%r1,%1) \n\t"
145- "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
146- "vst %%v18, (32+192)(%%r1,%2) \n\t"
147- "vl %%v27, (48+192)(%%r1,%2) \n\t"
148- "vl %%v19, (48+192)(%%r1,%1) \n\t"
149- "vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
150- "vst %%v19, (48+192)(%%r1,%2) \n\t"
151-
152-
153- "la %%r1,256(%%r1) \n\t"
154- "brctg %3,1b"
155- :
156- :"r "(n ),"a" (x ),"a" (y ),"a" (alpha )
157- :"cc ", "memory ", "r1 " ,"v0 " ,"v16 ","v17 ","v18 ","v19 ", "v24 ","v25 ","v26 ","v27 "
158- );
159- }
160-
161- #elif defined(Z13_C )
162- static void __attribute__ ((noinline )) daxpy_kernel_32 (BLASLONG n , FLOAT * x , FLOAT * y , FLOAT * alpha )
163- {
164-
165- __asm__ volatile (
166- #if defined(PREFETCH_INS )
167- "pfd 1, 0(%1) \n\t"
168- "pfd 2, 0(%2) \n\t"
169- #endif
170- "vlrepg %%v0 , 0(%3) \n\t"
171- "srlg %3,%0,5 \n\t"
172- "xgr %%r1,%%r1 \n\t"
173- "vlr %%v1,%%v0 \n\t"
174- ".align 16 \n\t"
175- "1: \n\t"
176- #if defined(PREFETCH_INS )
177- "pfd 1, 256(%%r1,%1) \n\t"
178- "pfd 2, 256(%%r1,%2) \n\t"
179- #endif
180- "vl %%v16, 0(%%r1,%1) \n\t"
181- "vl %%v17, 16(%%r1,%1) \n\t"
182- "vl %%v18, 32(%%r1,%1) \n\t"
183- "vl %%v19, 48(%%r1,%1) \n\t"
184-
185- "vl %%v24, 0(%%r1,%2) \n\t"
186- "vl %%v25, 16(%%r1,%2) \n\t"
187- "vl %%v26, 32(%%r1,%2) \n\t"
188- "vl %%v27, 48(%%r1,%2) \n\t"
189- "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
190- "vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
191- "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
192- "vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
193- "vst %%v16, 0(%%r1,%2) \n\t"
194- "vst %%v17, 16(%%r1,%2) \n\t"
195- "vst %%v18, 32(%%r1,%2) \n\t"
196- "vst %%v19, 48(%%r1,%2) \n\t"
197-
198- "vl %%v24, 64(%%r1,%1) \n\t"
199- "vl %%v25, 80(%%r1,%1) \n\t"
200- "vl %%v26, 96(%%r1,%1) \n\t"
201- "vl %%v27, 112(%%r1,%1) \n\t"
202-
203- "vl %%v16, 64(%%r1,%2) \n\t"
204- "vl %%v17, 80(%%r1,%2) \n\t"
205- "vl %%v18, 96(%%r1,%2) \n\t"
206- "vl %%v19, 112(%%r1,%2) \n\t"
207-
208-
209- "vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
210- "vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
211- "vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
212- "vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
213-
214- "vst %%v24, 64(%%r1,%2) \n\t"
215- "vst %%v25, 80(%%r1,%2) \n\t"
216- "vst %%v26, 96(%%r1,%2) \n\t"
217- "vst %%v27, 112(%%r1,%2) \n\t"
218-
219- "vl %%v16, (0+128)(%%r1,%1) \n\t"
220- "vl %%v17, (16+128)(%%r1,%1) \n\t"
221- "vl %%v18, (32+128)(%%r1,%1) \n\t"
222- "vl %%v19, (48+128)(%%r1,%1) \n\t"
223-
224- "vl %%v24, (0+128)(%%r1,%2) \n\t"
225- "vl %%v25, (16+128)(%%r1,%2) \n\t"
226- "vl %%v26, (32+128)(%%r1,%2) \n\t"
227- "vl %%v27, (48+128)(%%r1,%2) \n\t"
228-
229- "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
230- "vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
231- "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
232- "vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
233- "vst %%v16, (0+128)(%%r1,%2) \n\t"
234- "vst %%v17, (16+128)(%%r1,%2) \n\t"
235- "vst %%v18, (32+128)(%%r1,%2) \n\t"
236- "vst %%v19, (48+128)(%%r1,%2) \n\t"
237-
238- "vl %%v24, (64+128)(%%r1,%1) \n\t"
239- "vl %%v25, (80+128)(%%r1,%1) \n\t"
240- "vl %%v26, (96+128)(%%r1,%1) \n\t"
241- "vl %%v27, (112+128)(%%r1,%1) \n\t"
242-
243- "vl %%v16, (64+128)(%%r1,%2) \n\t"
244- "vl %%v17, (80+128)(%%r1,%2) \n\t"
245- "vl %%v18, (96+128)(%%r1,%2) \n\t"
246- "vl %%v19, (112+128)(%%r1,%2) \n\t"
247-
248- "vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
249- "vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
250- "vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
251- "vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
252-
253- "vst %%v24, (64+128)(%%r1,%2) \n\t"
254- "vst %%v25, (80+128)(%%r1,%2) \n\t"
255- "vst %%v26, (96+128)(%%r1,%2) \n\t"
256- "vst %%v27, (112+128)(%%r1,%2) \n\t"
257-
258- "la %%r1,256(%%r1) \n\t"
259- "brctg %3,1b"
260- :
261- :"r "(n ),"a" (x ),"a" (y ),"a" (alpha )
262- :"cc ", "memory ", "r1 " ,"v0 ","v1 ","v16 ","v17 ","v18 ","v19 ", "v24 ","v25 ","v26 ","v27 "
263- );
264- }
265-
266-
267- #elif defined(Z13_D )
268- static void __attribute__ ((noinline )) daxpy_kernel_32 (BLASLONG n , FLOAT * x , FLOAT * y , FLOAT * alpha )
63+ #else
64+ static void daxpy_kernel_32 (BLASLONG n , FLOAT * x , FLOAT * y , FLOAT alpha )
26965{
27066
27167 __asm__ volatile (
27268#if defined(PREFETCH_INS )
273- "pfd 1, 0(%1) \n\t"
274- "pfd 2, 0(%2) \n\t"
275- #endif
276- "vlrepg %%v0 , 0(%3) \n\t"
277- "srlg %3,%0,5 \n\t"
278- "vlr %%v1,%%v0 \n\t"
69+ "pfd 1, 0(%[x_tmp]) \n\t"
70+ "pfd 2, 0(%[y_tmp]) \n\t"
71+ #endif
72+ "lgdr %%r0,%[alpha] \n\t"
73+ "vlvgp %%v0,%%r0,%%r0 \n\t"
74+ "srlg %%r0,%[n],5 \n\t"
75+ "vlr %%v1,%%v0 \n\t"
27976 ".align 16 \n\t"
28077 "1: \n\t"
28178#if defined(PREFETCH_INS )
282- "pfd 1, 256(%1 ) \n\t"
283- "pfd 2, 256(%2 ) \n\t"
79+ "pfd 1, 256(%[x_tmp] ) \n\t"
80+ "pfd 2, 256(%[y_tmp] ) \n\t"
28481#endif
285- "vlm %%v16,%%v23, 0(%1) \n\t"
286- "vlm %%v24, %%v31, 0(%2) \n\t"
287- "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
288- "vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
289- "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
290- "vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
291- "vfmadb %%v20,%%v0,%%v20,%%v28 \n\t"
292- "vfmadb %%v21,%%v1,%%v21,%%v29 \n\t"
293- "vfmadb %%v22,%%v0,%%v22,%%v30 \n\t"
294- "vfmadb %%v23,%%v1,%%v23,%%v31 \n\t"
295- "vstm %%v16,%%v23, 0(%2) \n\t"
296- "vlm %%v24,%%v31, 128(%1) \n\t"
297- "vlm %%v16,%%v23, 128(%2 ) \n\t"
298- "vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
299- "vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
300- "vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
301- "vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
302- "vfmadb %%v28,%%v0,%%v28,%%v20 \n\t"
303- "vfmadb %%v29,%%v1,%%v29,%%v21 \n\t"
304- "vfmadb %%v30,%%v0,%%v30,%%v22 \n\t"
305- "vfmadb %%v31,%%v1,%%v31,%%v23 \n\t"
306- "la %1,256(%1) \n\t"
307- "vstm %%v24, %%v31, 128(%2) \n\t"
308- "la %2,256(%2) \n\t"
309- "brctg %3 ,1b"
310- :
311- :" r "( n ), "a" ( x ),"a" ( y ), "a " (alpha )
312- :"cc" , "memory" , "v0" ,"v1" ,"v16" ,"v17" ,"v18" ,"v19" ,"v20" ,"v21" ,
82+ "vlm %%v16,%%v23, 0(%[x_tmp]) \n\t"
83+ "vlm %%v24, %%v31, 0(%[y_tmp]) \n\t"
84+ "vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
85+ "vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
86+ "vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
87+ "vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
88+ "vfmadb %%v20,%%v0,%%v20,%%v28 \n\t"
89+ "vfmadb %%v21,%%v1,%%v21,%%v29 \n\t"
90+ "vfmadb %%v22,%%v0,%%v22,%%v30 \n\t"
91+ "vfmadb %%v23,%%v1,%%v23,%%v31 \n\t"
92+ "vstm %%v16,%%v23, 0(%[y_tmp]) \n\t"
93+ "vlm %%v24,%%v31, 128(%[x_tmp]) \n\t"
94+ "vlm %%v16,%%v23, 128(%[y_tmp] ) \n\t"
95+ "vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
96+ "vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
97+ "vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
98+ "vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
99+ "vfmadb %%v28,%%v0,%%v28,%%v20 \n\t"
100+ "vfmadb %%v29,%%v1,%%v29,%%v21 \n\t"
101+ "vfmadb %%v30,%%v0,%%v30,%%v22 \n\t"
102+ "vfmadb %%v31,%%v1,%%v31,%%v23 \n\t"
103+ "la %[x_tmp],256(%[x_tmp]) \n\t"
104+ "vstm %%v24, %%v31, 128(%[y_tmp]) \n\t"
105+ "la %[y_tmp],256(%[y_tmp]) \n\t"
106+ "brctg %%r0 ,1b"
107+ : [ mem_y ] " + m " (*(double (*)[n])y), [x_tmp] " + & a "(x), [y_tmp] " + & a "(y)
108+ : [ mem_x ] "m" ( * ( const double ( * )[ n ]) x ), [ n ] " r "(n), [alpha] " f "(alpha)
109+ :" cc ", " r0 ", "v0 "," v1 "," v16 "," v17 "," v18 "," v19 "," v20 "," v21 ",
313110 " v22 "," v23 "," v24 "," v25 "," v26 "," v27 "," v28 "," v29 "," v30 "," v31 "
314111 );
315112
@@ -334,7 +131,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
334131 BLASLONG n1 = n & -32 ;
335132
336133 if ( n1 )
337- daxpy_kernel_32 (n1 , x , y , & da );
134+ daxpy_kernel_32 (n1 , x , y , da );
338135
339136 i = n1 ;
340137 while (i < n )
0 commit comments