@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2626*****************************************************************************/
2727
2828/**************************************************************************************
29- * 2016/04/03 Werner Saar (wernsaar@googlemail.com)
29+ * 2016/04/04 Werner Saar (wernsaar@googlemail.com)
3030* BLASTEST : OK
3131* CTEST : OK
3232* TEST : OK
@@ -38,6 +38,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3838
3939CGEMM_L4_BEGIN:
4040
41+ mr BO, B /* BO = read cursor, starts at B */
42+ mr BBO, BBUFFER /* BBO = write cursor, starts at BBUFFER */
43+ slwi T1, K, 3 /* T1 = K * 8; the loop subtracts 8 per pass */
44+ /* Expand B into BBUFFER. Each pass loads two 4-word vectors (32 bytes)
   from BO, splats every one of the 8 words across a full vector, and
   stores the 8 resulting vectors (128 bytes) at BBO.
   The loop is bottom-tested, so it always runs at least once; with
   T1 = 8*K and the bgt exit test it runs K passes for K >= 1. */
45+ CGEMM_L4_COPYB:
46+ dcbtst BBO, PRE /* store-prefetch hint for the destination at BBO + PRE */
47+
48+ lxvw4x vs3, o0, BO /* vs3 = first 4 words (o0/o16 presumably hold byte offsets 0/16 - TODO confirm) */
49+ lxvw4x vs11, o16, BO /* vs11 = next 4 words */
50+ xxspltw vs4, vs3, 0 /* vs4..vs7 = word 0..3 of vs3, each replicated x4 */
51+ xxspltw vs5, vs3, 1
52+ xxspltw vs6, vs3, 2
53+ xxspltw vs7, vs3, 3
54+ xxspltw vs12, vs11, 0 /* vs12..vs15 = word 0..3 of vs11, each replicated x4 */
55+ xxspltw vs13, vs11, 1
56+ xxspltw vs14, vs11, 2
57+ xxspltw vs15, vs11, 3
58+ stxvw4x vs4, o0, BBO /* first 64 bytes of output */
59+ stxvw4x vs5, o16, BBO
60+ stxvw4x vs6, o32, BBO
61+ stxvw4x vs7, o48, BBO
62+ addi BO, BO, 32 /* consumed 32 bytes of B */
63+ addi BBO, BBO, 64
64+ stxvw4x vs12, o0, BBO /* second 64 bytes of output */
65+ stxvw4x vs13, o16, BBO
66+ stxvw4x vs14, o32, BBO
67+ stxvw4x vs15, o48, BBO
68+ addic. T1, T1, -8 /* record form: CR0 reflects the new T1 */
69+ addi BBO, BBO, 64 /* addi leaves CR0 untouched, so the branch below still tests T1 */
70+ /* bgt, not bge: with bge one more pass ran when T1 reached exactly 0,
   reading 32 bytes beyond the K*32-byte source range and writing 128 bytes
   beyond the matching BBUFFER range. */
71+ bgt CGEMM_L4_COPYB
72+
73+
4174 mr CO, C
4275 mr AO, A
4376 slwi T1, LDC , 2
@@ -48,7 +81,7 @@ CGEMM_L4_BEGIN:
4881CGEMM_L4x8_BEGIN:
4982
5083
51- mr BO, B
84+ mr BO, BBUFFER
5285 srawi. L, K, 3
5386 ble CGEMM_L4x8_SUB0
5487 cmpwi cr0, L, 1
@@ -59,18 +92,25 @@ CGEMM_L4x8_LOOP_START:
5992 dcbt AO, PRE
6093 dcbt BO, PRE
6194 LOAD4x8_1
95+ dcbt BO, PRE
6296 KERNEL4x8_I1
97+ dcbt BO, PRE
6398 dcbt AO, PRE
6499 KERNEL4x8_2
100+ dcbt BO, PRE
65101 KERNEL4x8_1
102+ dcbt BO, PRE
66103 dcbt AO, PRE
67104 KERNEL4x8_2
68105
106+ dcbt BO, PRE
69107 KERNEL4x8_1
70- dcbt AO, PRE
71108 dcbt BO, PRE
109+ dcbt AO, PRE
72110 KERNEL4x8_2
111+ dcbt BO, PRE
73112 KERNEL4x8_1
113+ dcbt BO, PRE
74114 dcbt AO, PRE
75115 KERNEL4x8_2
76116
@@ -81,18 +121,25 @@ CGEMM_L4x8_LOOP_START:
81121
82122CGEMM_L4x8_LOOP:
83123
124+ dcbt BO, PRE
84125 KERNEL4x8_1
126+ dcbt BO, PRE
85127 dcbt AO, PRE
86128 KERNEL4x8_2
129+ dcbt BO, PRE
87130 KERNEL4x8_1
131+ dcbt BO, PRE
88132 dcbt AO, PRE
89133 KERNEL4x8_2
90134
135+ dcbt BO, PRE
91136 KERNEL4x8_1
92- dcbt AO, PRE
93137 dcbt BO, PRE
138+ dcbt AO, PRE
94139 KERNEL4x8_2
140+ dcbt BO, PRE
95141 KERNEL4x8_1
142+ dcbt BO, PRE
96143 dcbt AO, PRE
97144 KERNEL4x8_2
98145
@@ -101,7 +148,9 @@ CGEMM_L4x8_LOOP:
101148
102149CGEMM_L4x8_LOOP_END:
103150
151+ dcbt BO, PRE
104152 KERNEL4x8_1
153+ dcbt BO, PRE
105154 dcbt AO, PRE
106155 KERNEL4x8_2
107156 KERNEL4x8_1
@@ -168,7 +217,7 @@ CGEMM_L4x4_BEGIN:
168217
169218 andi. T1, M, 4
170219 ble CGEMM_L4x4_END
171- mr BO, B
220+ mr BO, BBUFFER
172221 srawi. L, K, 3
173222 ble CGEMM_L4x4_SUB0
174223 cmpwi cr0, L, 1
@@ -268,7 +317,7 @@ CGEMM_L4x2_BEGIN:
268317
269318 andi. T1, M, 2
270319 ble CGEMM_L4x2_END
271- mr BO, B
320+ mr BO, BBUFFER
272321 srawi. L, K, 3
273322 ble CGEMM_L4x2_SUB0
274323 cmpwi cr0, L, 1
@@ -368,7 +417,7 @@ CGEMM_L4x1_BEGIN:
368417
369418 andi. T1, M, 1
370419 ble CGEMM_L4x1_END
371- mr BO, B
420+ mr BO, BBUFFER
372421 srawi. L, K, 3
373422 ble CGEMM_L4x1_SUB0
374423 cmpwi cr0, L, 1
@@ -482,6 +531,39 @@ L999_H1:
482531
483532CGEMM_L2_BEGIN:
484533
534+ mr BO, B /* BO = read cursor, starts at B */
535+ mr BBO, BBUFFER /* BBO = write cursor, starts at BBUFFER */
536+ slwi T1, K, 2 /* T1 = K * 4; the loop subtracts 8 per pass */
537+ /* Expand B into BBUFFER. Each pass loads two 4-word vectors (32 bytes)
   from BO, splats every one of the 8 words across a full vector, and
   stores the 8 resulting vectors (128 bytes) at BBO.
   The loop is bottom-tested, so it always runs at least once; with
   T1 = 4*K and the bgt exit test it runs ceil(K/2) passes for K >= 1.
   NOTE(review): this expansion runs before the `andi. T1, N, 2` test that
   follows it, so it is executed even when that test branches to
   CGEMM_L2_END - confirm whether that is intended. */
538+ CGEMM_L2_COPYB:
539+ dcbtst BBO, PRE /* store-prefetch hint for the destination at BBO + PRE */
540+
541+ lxvw4x vs3, o0, BO /* vs3 = first 4 words (o0/o16 presumably hold byte offsets 0/16 - TODO confirm) */
542+ lxvw4x vs11, o16, BO /* vs11 = next 4 words */
543+ xxspltw vs4, vs3, 0 /* vs4..vs7 = word 0..3 of vs3, each replicated x4 */
544+ xxspltw vs5, vs3, 1
545+ xxspltw vs6, vs3, 2
546+ xxspltw vs7, vs3, 3
547+ xxspltw vs12, vs11, 0 /* vs12..vs15 = word 0..3 of vs11, each replicated x4 */
548+ xxspltw vs13, vs11, 1
549+ xxspltw vs14, vs11, 2
550+ xxspltw vs15, vs11, 3
551+ stxvw4x vs4, o0, BBO /* first 64 bytes of output */
552+ stxvw4x vs5, o16, BBO
553+ stxvw4x vs6, o32, BBO
554+ stxvw4x vs7, o48, BBO
555+ addi BO, BO, 32 /* consumed 32 bytes of B */
556+ addi BBO, BBO, 64
557+ stxvw4x vs12, o0, BBO /* second 64 bytes of output */
558+ stxvw4x vs13, o16, BBO
559+ stxvw4x vs14, o32, BBO
560+ stxvw4x vs15, o48, BBO
561+ addic. T1, T1, -8 /* record form: CR0 reflects the new T1 */
562+ addi BBO, BBO, 64 /* addi leaves CR0 untouched, so the branch below still tests T1 */
563+ /* bgt, not bge: with bge one more pass ran whenever T1 reached exactly 0
   (K even), reading 32 bytes and writing 128 bytes beyond what the
   K*4 counter accounts for. */
564+ bgt CGEMM_L2_COPYB
565+
566+
485567 andi. T1, N, 2
486568 ble CGEMM_L2_END
487569 mr CO, C
@@ -494,7 +576,7 @@ CGEMM_L2_BEGIN:
494576CGEMM_L2x8_BEGIN:
495577
496578
497- mr BO, B
579+ mr BO, BBUFFER
498580 srawi. L, K, 3
499581 ble CGEMM_L2x8_SUB0
500582 cmpwi cr0, L, 1
@@ -611,7 +693,7 @@ CGEMM_L2x4_BEGIN:
611693
612694 andi. T1, M, 4
613695 ble CGEMM_L2x4_END
614- mr BO, B
696+ mr BO, BBUFFER
615697 srawi. L, K, 3
616698 ble CGEMM_L2x4_SUB0
617699 cmpwi cr0, L, 1
@@ -711,7 +793,7 @@ CGEMM_L2x2_BEGIN:
711793
712794 andi. T1, M, 2
713795 ble CGEMM_L2x2_END
714- mr BO, B
796+ mr BO, BBUFFER
715797 srawi. L, K, 3
716798 ble CGEMM_L2x2_SUB0
717799 cmpwi cr0, L, 1
@@ -811,7 +893,7 @@ CGEMM_L2x1_BEGIN:
811893
812894 andi. T1, M, 1
813895 ble CGEMM_L2x1_END
814- mr BO, B
896+ mr BO, BBUFFER
815897 srawi. L, K, 3
816898 ble CGEMM_L2x1_SUB0
817899 cmpwi cr0, L, 1
@@ -919,6 +1001,39 @@ L999_H2:
9191001
9201002CGEMM_L1_BEGIN:
9211003
1004+ mr BO, B /* BO = read cursor, starts at B */
1005+ mr BBO, BBUFFER /* BBO = write cursor, starts at BBUFFER */
1006+ slwi T1, K, 1 /* T1 = K * 2; the loop subtracts 8 per pass */
1007+ /* Expand B into BBUFFER. Each pass loads two 4-word vectors (32 bytes)
   from BO, splats every one of the 8 words across a full vector, and
   stores the 8 resulting vectors (128 bytes) at BBO.
   The loop is bottom-tested, so it always runs at least once; with
   T1 = 2*K and the bgt exit test it runs ceil(K/4) passes for K >= 1.
   NOTE(review): this expansion runs before the `andi. T1, N, 1` test that
   follows it, so it is executed even when that test branches to
   CGEMM_L1_END - confirm whether that is intended. */
1008+ CGEMM_L1_COPYB:
1009+ dcbtst BBO, PRE /* store-prefetch hint for the destination at BBO + PRE */
1010+
1011+ lxvw4x vs3, o0, BO /* vs3 = first 4 words (o0/o16 presumably hold byte offsets 0/16 - TODO confirm) */
1012+ lxvw4x vs11, o16, BO /* vs11 = next 4 words */
1013+ xxspltw vs4, vs3, 0 /* vs4..vs7 = word 0..3 of vs3, each replicated x4 */
1014+ xxspltw vs5, vs3, 1
1015+ xxspltw vs6, vs3, 2
1016+ xxspltw vs7, vs3, 3
1017+ xxspltw vs12, vs11, 0 /* vs12..vs15 = word 0..3 of vs11, each replicated x4 */
1018+ xxspltw vs13, vs11, 1
1019+ xxspltw vs14, vs11, 2
1020+ xxspltw vs15, vs11, 3
1021+ stxvw4x vs4, o0, BBO /* first 64 bytes of output */
1022+ stxvw4x vs5, o16, BBO
1023+ stxvw4x vs6, o32, BBO
1024+ stxvw4x vs7, o48, BBO
1025+ addi BO, BO, 32 /* consumed 32 bytes of B */
1026+ addi BBO, BBO, 64
1027+ stxvw4x vs12, o0, BBO /* second 64 bytes of output */
1028+ stxvw4x vs13, o16, BBO
1029+ stxvw4x vs14, o32, BBO
1030+ stxvw4x vs15, o48, BBO
1031+ addic. T1, T1, -8 /* record form: CR0 reflects the new T1 */
1032+ addi BBO, BBO, 64 /* addi leaves CR0 untouched, so the branch below still tests T1 */
1033+ /* bgt, not bge: with bge one more pass ran whenever T1 reached exactly 0
   (K a multiple of 4), reading 32 bytes and writing 128 bytes beyond what
   the K*2 counter accounts for. */
1034+ bgt CGEMM_L1_COPYB
1035+
1036+
9221037 andi. T1, N, 1
9231038 ble CGEMM_L1_END
9241039 mr CO, C
@@ -929,7 +1044,7 @@ CGEMM_L1_BEGIN:
9291044CGEMM_L1x8_BEGIN:
9301045
9311046
932- mr BO, B
1047+ mr BO, BBUFFER
9331048 srawi. L, K, 3
9341049 ble CGEMM_L1x8_SUB0
9351050 cmpwi cr0, L, 1
@@ -1046,7 +1161,7 @@ CGEMM_L1x4_BEGIN:
10461161
10471162 andi. T1, M, 4
10481163 ble CGEMM_L1x4_END
1049- mr BO, B
1164+ mr BO, BBUFFER
10501165 srawi. L, K, 3
10511166 ble CGEMM_L1x4_SUB0
10521167 cmpwi cr0, L, 1
@@ -1146,7 +1261,7 @@ CGEMM_L1x2_BEGIN:
11461261
11471262 andi. T1, M, 2
11481263 ble CGEMM_L1x2_END
1149- mr BO, B
1264+ mr BO, BBUFFER
11501265 srawi. L, K, 3
11511266 ble CGEMM_L1x2_SUB0
11521267 cmpwi cr0, L, 1
@@ -1246,7 +1361,7 @@ CGEMM_L1x1_BEGIN:
12461361
12471362 andi. T1, M, 1
12481363 ble CGEMM_L1x1_END
1249- mr BO, B
1364+ mr BO, BBUFFER
12501365 srawi. L, K, 3
12511366 ble CGEMM_L1x1_SUB0
12521367 cmpwi cr0, L, 1
0 commit comments