@@ -30,75 +30,76 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3030
3131
3232#if defined(Z13 )
33- static void __attribute__ (( noinline )) ddot_kernel_8 (BLASLONG n , FLOAT * x , FLOAT * y , FLOAT * d )
33+ static FLOAT ddot_kernel_8 (BLASLONG n , FLOAT * x , FLOAT * y )
3434{
35-
36- __asm__ volatile (
37- "pfd 1, 0(%1) \n\t"
38- "pfd 1, 0(%2) \n\t"
39- "vzero %%v24 \n\t"
40- "vzero %%v25 \n\t"
41- "vzero %%v26 \n\t"
42- "vzero %%v27 \n\t"
43- "srlg %%r0,%0,4 \n\t"
44- "xgr %%r1,%%r1 \n\t"
45- ".align 16 \n\t"
46- "1: \n\t"
47- "pfd 1, 256(%%r1,%1) \n\t"
48- "pfd 1, 256(%%r1,%2) \n\t"
49- "vl %%v16, 0(%%r1,%1) \n\t"
50- "vl %%v17, 16(%%r1,%1) \n\t"
51- "vl %%v18, 32(%%r1,%1) \n\t"
52- "vl %%v19, 48(%%r1,%1) \n\t"
53-
54- "vl %%v28, 0(%%r1,%2) \n\t"
55- "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
56- "vl %%v29, 16(%%r1,%2) \n\t"
57- "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
35+ FLOAT dot ;
36+ __asm__ volatile (
37+ "pfd 1, 0(%2) \n\t"
38+ "pfd 1, 0(%3) \n\t"
39+ "vzero %%v24 \n\t"
40+ "vzero %%v25 \n\t"
41+ "vzero %%v26 \n\t"
42+ "vzero %%v27 \n\t"
43+ "srlg %1,%1,4 \n\t"
44+ "xgr %%r1,%%r1 \n\t"
45+ ".align 16 \n\t"
46+ "1: \n\t"
47+ "pfd 1, 256(%%r1,%2) \n\t"
48+ "pfd 1, 256(%%r1,%3) \n\t"
49+ "vl %%v16, 0(%%r1,%2) \n\t"
50+ "vl %%v17, 16(%%r1,%2) \n\t"
51+ "vl %%v18, 32(%%r1,%2) \n\t"
52+ "vl %%v19, 48(%%r1,%2) \n\t"
53+
54+ "vl %%v28, 0(%%r1,%3) \n\t"
55+ "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
56+ "vl %%v29, 16(%%r1,%3) \n\t"
57+ "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
58+
59+ "vl %%v30, 32(%%r1,%3) \n\t"
60+ "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
61+ "vl %%v31, 48(%%r1,%3) \n\t"
62+ "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
63+
64+ "vl %%v16, 64(%%r1,%2) \n\t"
65+ "vl %%v17, 80(%%r1,%2) \n\t"
66+ "vl %%v18, 96(%%r1,%2) \n\t"
67+ "vl %%v19, 112(%%r1,%2) \n\t"
68+
69+ "vl %%v28, 64(%%r1,%3) \n\t"
70+ "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
71+ "vl %%v29, 80(%%r1,%3) \n\t"
72+ "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
73+
74+
75+ "vl %%v30, 96(%%r1,%3) \n\t"
76+ "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
77+ "vl %%v31, 112(%%r1,%3) \n\t"
78+ "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
5879
59- "vl %%v30, 32(%%r1,%2) \n\t"
60- "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
61- "vl %%v31, 48(%%r1,%2) \n\t"
62- "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
63-
64- "vl %%v16, 64(%%r1,%1) \n\t"
65- "vl %%v17, 80(%%r1,%1) \n\t"
66- "vl %%v18, 96(%%r1,%1) \n\t"
67- "vl %%v19, 112(%%r1,%1) \n\t"
68-
69- "vl %%v28, 64(%%r1,%2) \n\t"
70- "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
71- "vl %%v29, 80(%%r1,%2) \n\t"
72- "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
73-
74-
75- "vl %%v30, 96(%%r1,%2) \n\t"
76- "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
77- "vl %%v31, 112(%%r1,%2) \n\t"
78- "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
79-
80-
81- "la %%r1,128(%%r1) \n\t"
82- "brctg %%r0,1b \n\t"
83- "vfadb %%v24,%%v25,%%v24 \n\t"
84- "vfadb %%v24,%%v26,%%v24 \n\t"
85- "vfadb %%v24,%%v27,%%v24 \n\t"
86- "vrepg %%v1,%%v24,1 \n\t"
87- "vfadb %%v1,%%v24,%%v1 \n\t"
88- " std %%f1,0(%3) \n\t"
89- :
90- :"r" (n ),"a" (x ),"a" (y ),"a" (d )
91- :"cc" , "memory" ,"r0" ,"r1" ,"v16" , "v17" ,"v18" ,"v19" ,"v20" ,"v21" ,"v22" ,"v23" ,
92- "v24" ,"v25" ,"v26" ,"v27" ,"v28" ,"v29" ,"v30" ,"v31"
93-
94- );
80+
81+ "la %%r1,128(%%r1) \n\t"
82+ "brctg %1,1b \n\t"
83+ "vfadb %%v24,%%v25,%%v24 \n\t"
84+ "vfadb %%v24,%%v26,%%v24 \n\t"
85+ "vfadb %%v24,%%v27,%%v24 \n\t"
86+ "vrepg %%v1,%%v24,1 \n\t"
87+ "vfadb %%v1,%%v24,%%v1 \n\t"
88+ "ldr %0, %%f1 \n\t"
89+ : "=f" (dot ) ,"+&r" (n )
90+ : "a" (x ),"a" (y )
91+ :"cc" , "r1" ,"v16" , "v17" ,"v18" ,"v19" ,"v20" ,"v21" ,"v22" ,"v23" ,
92+ "v24" ,"v25" ,"v26" ,"v27" ,"v28" ,"v29" ,"v30" ,"v31"
93+
94+ );
95+ return dot ;
9596
9697}
9798
9899
99100#else
100101
101- static void ddot_kernel_8 (BLASLONG n , FLOAT * x , FLOAT * y , FLOAT * d )
102+ static FLOAT ddot_kernel_8 (BLASLONG n , FLOAT * x , FLOAT * y )
102103{
103104 BLASLONG register i = 0 ;
104105 FLOAT dot = 0.0 ;
@@ -117,8 +118,8 @@ static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
117118 i += 8 ;
118119
119120 }
120- * d += dot ;
121-
121+ return dot ;
122+
122123}
123124
124125#endif
@@ -136,9 +137,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
136137 {
137138
138139 BLASLONG n1 = n & -16 ;
139-
140+
140141 if ( n1 )
141- ddot_kernel_8 (n1 , x , y , & dot );
142+ dot = ddot_kernel_8 (n1 , x , y );
142143
143144 i = n1 ;
144145 while (i < n )
0 commit comments