Skip to content

Commit 58c90d5

Browse files
benedikt-huber authored and xianyi committed
# The first commit's message is:
Optimizations for APM's X-Gene 1 (aarch64). 1) General system updates to support ARMv8 better. Running `make all` did not work; one needed to supply TARGET=ARMV8. 2) sgemm 4x4 kernel in assembler using SIMD, and configuration changes to use it. 3) strmm 4x4 kernel in C. Since the sgemm kernel does 4x4, the trmm kernel must also do 4xN. Added Dave Nuechterlein to the contributors list.
1 parent 2987bc7 commit 58c90d5

8 files changed

Lines changed: 2442 additions & 16 deletions

File tree

CONTRIBUTORS.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,5 +117,9 @@ In chronological order:
117117
* Isaac Dunham <https://github.com/idunham>
118118
* [2014-08-03] Fixed link error on Linux/musl
119119

120+
* Dave Nuechterlein
121+
* [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1).
122+
ARMv8 support.
123+
120124
* [Your name or handle] <[email or website]>
121125
* [Date] [Brief summary of your changes]

common_arm64.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -119,9 +119,9 @@ static inline int blas_quickdivide(blasint x, blasint y){
119119
}
120120

121121
#if defined(DOUBLE)
122-
#define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory")
122+
#define GET_IMAGE(res) __asm__ __volatile__("str d1, %0" : "=m"(res) : : "memory")
123123
#else
124-
#define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory")
124+
#define GET_IMAGE(res) __asm__ __volatile__("str s1, %0" : "=m"(res) : : "memory")
125125
#endif
126126

127127
#define GET_IMAGE_CANCEL
@@ -138,7 +138,6 @@ static inline int blas_quickdivide(blasint x, blasint y){
138138
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
139139

140140
#define PROLOGUE \
141-
.arm ;\
142141
.global REALNAME ;\
143142
.func REALNAME ;\
144143
REALNAME:

cpuid_arm64.c

Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
/**************************************************************************
2+
Copyright (c) 2013, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#include <string.h>
29+
30+
#define CPU_UNKNOWN 0
31+
#define CPU_ARMV8 1
32+
33+
/*
 * Printable core names, indexed by the CPU_* identifiers defined above
 * (CPU_UNKNOWN = 0, CPU_ARMV8 = 1).
 * The unknown entry is spelled "UNKNOWN" so that it agrees with the string
 * get_subarchitecture() prints for an unrecognised CPU.
 */
static char *cpuname[] = {
	"UNKNOWN",
	"ARMV8"
};
37+
38+
39+
/*
 * Report whether the CPU advertises a given feature flag.
 *
 * On Linux, scans /proc/cpuinfo for the first line that starts with
 * "Features" and compares every whitespace-separated token after the ':'
 * with `search` (exact, case-sensitive match).
 *
 * search: name of the feature to look for.
 *
 * Returns 1 when the feature is listed. Returns 0 when it is not listed,
 * when `search` is NULL, when /proc/cpuinfo cannot be opened or has no
 * usable "Features" line, and on non-Linux builds.
 */
int get_feature(char *search)
{

	if (search == NULL) return(0);

	/* `linux` is not predefined in strict ISO modes; accept `__linux__` too. */
#if defined(linux) || defined(__linux__)
	FILE *infile;
	char buffer[2048], *p, *t;

	p = NULL;

	infile = fopen("/proc/cpuinfo", "r");
	if (infile == NULL) return(0);

	while (fgets(buffer, sizeof(buffer), infile))
	{
		if (!strncmp("Features", buffer, 8))
		{
			/* Start just after the ':'; a line without one yields no tokens. */
			p = strchr(buffer, ':');
			if (p != NULL) p++;
			break;
		}
	}

	fclose(infile);

	if (p == NULL) return(0);

	/*
	 * Compare every token, including the first one, and treat the newline
	 * kept by fgets() as a delimiter so the last feature can match too.
	 */
	for (t = strtok(p, " \t\n"); t != NULL; t = strtok(NULL, " \t\n"))
	{
		if (!strcmp(t, search)) { return(1); }
	}

#endif
	return(0);
}
73+
74+
75+
int detect(void)
76+
{
77+
78+
#ifdef linux
79+
80+
FILE *infile;
81+
char buffer[512], *p;
82+
p = (char *) NULL ;
83+
84+
infile = fopen("/proc/cpuinfo", "r");
85+
86+
while (fgets(buffer, sizeof(buffer), infile))
87+
{
88+
89+
if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)))
90+
{
91+
p = strchr(buffer, ':') + 2;
92+
break;
93+
}
94+
}
95+
96+
fclose(infile);
97+
98+
if(p != NULL)
99+
{
100+
101+
if (strstr(p, "AArch64"))
102+
{
103+
return CPU_ARMV8;
104+
105+
}
106+
107+
108+
}
109+
#endif
110+
111+
return CPU_UNKNOWN;
112+
}
113+
114+
char *get_corename(void)
115+
{
116+
return cpuname[detect()];
117+
}
118+
119+
/* Write the architecture name "ARM" to stdout, without a trailing newline. */
void get_architecture(void)
{
	fputs("ARM", stdout);
}
123+
124+
void get_subarchitecture(void)
125+
{
126+
int d = detect();
127+
switch (d)
128+
{
129+
130+
case CPU_ARMV8:
131+
printf("ARMV8");
132+
break;
133+
134+
default:
135+
printf("UNKNOWN");
136+
break;
137+
}
138+
}
139+
140+
/* Write the kernel sub-directory name "arm64" to stdout, without a trailing newline. */
void get_subdirname(void)
{
	fputs("arm64", stdout);
}
144+
145+
void get_cpuconfig(void)
146+
{
147+
148+
int d = detect();
149+
switch (d)
150+
{
151+
152+
case CPU_ARMV8:
153+
printf("#define ARMV8\n");
154+
printf("#define L1_DATA_SIZE 32768\n");
155+
printf("#define L1_DATA_LINESIZE 64\n");
156+
printf("#define L2_SIZE 262144\n");
157+
printf("#define L2_LINESIZE 64\n");
158+
printf("#define DTB_DEFAULT_ENTRIES 64\n");
159+
printf("#define DTB_SIZE 4096\n");
160+
printf("#define L2_ASSOCIATIVE 4\n");
161+
break;
162+
163+
164+
}
165+
}
166+
167+
168+
void get_libname(void)
169+
{
170+
171+
int d = detect();
172+
switch (d)
173+
{
174+
175+
case CPU_ARMV8:
176+
printf("armv8\n");
177+
break;
178+
179+
}
180+
}
181+
182+
183+
/*
 * Walk the CPU feature list.
 *
 * On Linux, locates the first "Features" line of /proc/cpuinfo and
 * tokenizes the text after the ':'. No token is currently translated into
 * output, so the function has no visible effect; it returns quietly when
 * /proc/cpuinfo cannot be opened or has no usable "Features" line, and it
 * does nothing on non-Linux builds.
 */
void get_features(void)
{

	/* `linux` is not predefined in strict ISO modes; accept `__linux__` too. */
#if defined(linux) || defined(__linux__)
	FILE *infile;
	char buffer[2048], *p, *t;

	p = NULL;

	infile = fopen("/proc/cpuinfo", "r");
	if (infile == NULL) return;

	while (fgets(buffer, sizeof(buffer), infile))
	{
		if (!strncmp("Features", buffer, 8))
		{
			/* Start just after the ':'; a line without one yields no tokens. */
			p = strchr(buffer, ':');
			if (p != NULL) p++;
			break;
		}
	}

	fclose(infile);

	if (p == NULL) return;

	/* Visit every feature token; none of them produces output yet. */
	for (t = strtok(p, " \t\n"); t != NULL; t = strtok(NULL, " \t\n"))
	{
		(void) t;
	}

#endif
	return;
}
216+
217+

getarch.c

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -746,12 +746,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
746746
#define SUBARCHITECTURE "ARMV8"
747747
#define SUBDIRNAME "arm64"
748748
#define ARCHCONFIG "-DARMV8 " \
749-
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
750-
"-DL2_SIZE=512488 -DL2_LINESIZE=32 " \
751-
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
752-
"-DHAVE_VFP -DHAVE_VFPV3 -DHAVE_VFPV4"
749+
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
750+
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
751+
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 "
753752
#define LIBNAME "armv8"
754-
#define CORENAME "ARMV8"
753+
#define CORENAME "XGENE1"
755754
#else
756755
#endif
757756

@@ -801,6 +800,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
801800
#define OPENBLAS_SUPPORTED
802801
#endif
803802

803+
#ifdef __aarch64__
804+
#include "cpuid_arm64.c"
805+
#define OPENBLAS_SUPPORTED
806+
#endif
807+
804808

805809
#ifndef OPENBLAS_SUPPORTED
806810
#error "This arch/CPU is not supported by OpenBLAS."
@@ -856,7 +860,7 @@ int main(int argc, char *argv[]){
856860
#ifdef FORCE
857861
printf("CORE=%s\n", CORENAME);
858862
#else
859-
#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__)
863+
#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__)
860864
printf("CORE=%s\n", get_corename());
861865
#endif
862866
#endif
@@ -956,7 +960,7 @@ int main(int argc, char *argv[]){
956960
#ifdef FORCE
957961
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
958962
#else
959-
#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__)
963+
#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__)
960964
printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
961965
#endif
962966
#endif

kernel/arm64/KERNEL.ARMV8

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -80,14 +80,14 @@ DGEMVTKERNEL = ../arm/gemv_t.c
8080
CGEMVTKERNEL = ../arm/zgemv_t.c
8181
ZGEMVTKERNEL = ../arm/zgemv_t.c
8282

83-
STRMMKERNEL = ../generic/trmmkernel_2x2.c
83+
STRMMKERNEL = ../generic/trmmkernel_4x4.c
8484
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
8585
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
8686
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
8787

88-
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
89-
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
90-
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
88+
SGEMMKERNEL = sgemm_kernel_4x4.S
89+
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
90+
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
9191
SGEMMONCOPYOBJ = sgemm_oncopy.o
9292
SGEMMOTCOPYOBJ = sgemm_otcopy.o
9393

0 commit comments

Comments
 (0)