Skip to content

Commit aae581d

Browse files
committed
Merge branch 'develop' into release-0.2.16
2 parents aa90518 + e173039 commit aae581d

101 files changed

Lines changed: 38024 additions & 374 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.travis.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ before_install:
2525
- if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi
2626

2727
script:
28+
- set -e
2829
- make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE
2930
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C test DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
3031
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C ctest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi

CONTRIBUTORS.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,5 +141,11 @@ In chronological order:
141141
* Martin Koehler <https://github.com/grisuthedragon/>
142142
* [2015-09-07] Improved imatcopy
143143

144+
* Ashwin Sekhar T K <https://github.com/ashwinyes/>
145+
* [2015-11-09] Assembly kernels for Cortex-A57 (ARMv8)
146+
* [2015-11-20] lapack-test fixes for Cortex-A57
147+
* [2016-03-14] Additional functional Assembly Kernels for Cortex-A57
148+
* [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57
149+
144150
* [Your name or handle] <[email or website]>
145151
* [Date] [Brief summary of your changes]

Makefile

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -83,20 +83,20 @@ shared :
8383
ifndef NO_SHARED
8484
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
8585
@$(MAKE) -C exports so
86-
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
87-
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
86+
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
87+
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
8888
endif
8989
ifeq ($(OSNAME), FreeBSD)
9090
@$(MAKE) -C exports so
91-
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
91+
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
9292
endif
9393
ifeq ($(OSNAME), NetBSD)
9494
@$(MAKE) -C exports so
95-
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
95+
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
9696
endif
9797
ifeq ($(OSNAME), Darwin)
9898
@$(MAKE) -C exports dyn
99-
@-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
99+
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
100100
endif
101101
ifeq ($(OSNAME), WINNT)
102102
@$(MAKE) -C exports dll

USAGE.md

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
# Notes on OpenBLAS usage
2+
## Usage
3+
4+
#### Program is Terminated. Because you tried to allocate too many memory regions
5+
6+
In OpenBLAS, we manage a pool of memory buffers and allocate the number of
7+
buffers as follows.
8+
```
9+
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2)
10+
```
11+
This error indicates that the program exceeded the number of buffers.
12+
13+
Please build OpenBLAS with a larger `NUM_THREADS`. For example, `make
14+
NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set
15+
`MAX_CPU_NUMBER=NUM_THREADS`.
16+
17+
#### How can I use OpenBLAS in multi-threaded applications?
18+
19+
If your application is already multi-threaded, it will conflict with OpenBLAS
20+
multi-threading. Thus, you must set OpenBLAS to use a single thread in any of the
21+
following ways:
22+
23+
* `export OPENBLAS_NUM_THREADS=1` in the environment variables.
24+
* Call `openblas_set_num_threads(1)` in the application at runtime.
25+
* Build the single-threaded version of OpenBLAS, e.g. `make USE_THREAD=0`
26+
27+
If the application is parallelized by OpenMP, please use OpenBLAS built with
28+
`USE_OPENMP=1`
29+
30+
#### How to choose TARGET manually at runtime when compiled with DYNAMIC_ARCH
31+
32+
The environment variable which control the kernel selection is
33+
`OPENBLAS_CORETYPE` (see `driver/others/dynamic.c`) e.g. `export
34+
OPENBLAS_CORETYPE=Haswell` and the function `char* openblas_get_corename()`
35+
returns the used target.
36+
37+
#### How can I disable OpenBLAS threading affinity at runtime?
38+
39+
You can define the `OPENBLAS_MAIN_FREE` or `GOTOBLAS_MAIN_FREE` environment
40+
variable to disable threading affinity at runtime. For example, before the
41+
run,
42+
```
43+
export OPENBLAS_MAIN_FREE=1
44+
```
45+
46+
Alternatively, you can disable the affinity feature by enabling `NO_AFFINITY=1`
47+
in `Makefile.rule`.
48+
49+
## Linking with the library
50+
51+
* Link with shared library
52+
53+
`gcc -o test test.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas`
54+
55+
If the library is multithreaded, please add `-lpthread`. If the library
56+
contains LAPACK functions, please add `-lgfortran` or other Fortran libs.
57+
58+
* Link with static library
59+
60+
`gcc -o test test.c /your/path/libopenblas.a`
61+
62+
You can download `test.c` from https://gist.github.com/xianyi/5780018
63+
64+
On Linux, if OpenBLAS was compiled with threading support (`USE_THREAD=1` by
65+
default), custom programs statically linked against `libopenblas.a` should also
66+
link with the pthread library e.g.:
67+
68+
```
69+
gcc -static -I/opt/OpenBLAS/include -L/opt/OpenBLAS/lib -o my_program my_program.c -lopenblas -lpthread
70+
```
71+
72+
Failing to add the `-lpthread` flag will cause errors such as:
73+
74+
```
75+
/opt/OpenBLAS/libopenblas.a(memory.o): In function `_touch_memory':
76+
memory.c:(.text+0x15): undefined reference to `pthread_mutex_lock'
77+
memory.c:(.text+0x41): undefined reference to `pthread_mutex_unlock'
78+
...
79+
```
80+
81+
## Code examples
82+
83+
#### Call CBLAS interface
84+
This example shows calling cblas_dgemm in C. https://gist.github.com/xianyi/6930656
85+
```
86+
#include <cblas.h>
87+
#include <stdio.h>
88+
89+
void main()
90+
{
91+
int i=0;
92+
double A[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};
93+
double B[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};
94+
double C[9] = {.5,.5,.5,.5,.5,.5,.5,.5,.5};
95+
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans,3,3,2,1,A, 3, B, 3,2,C,3);
96+
97+
for(i=0; i<9; i++)
98+
printf("%lf ", C[i]);
99+
printf("\n");
100+
}
101+
```
102+
`gcc -o test_cblas_open test_cblas_dgemm.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas -lpthread -lgfortran`
103+
104+
#### Call BLAS Fortran interface
105+
106+
This example shows calling dgemm Fortran interface in C. https://gist.github.com/xianyi/5780018
107+
108+
```
109+
#include "stdio.h"
110+
#include "stdlib.h"
111+
#include "sys/time.h"
112+
#include "time.h"
113+
114+
extern void dgemm_(char*, char*, int*, int*,int*, double*, double*, int*, double*, int*, double*, double*, int*);
115+
116+
int main(int argc, char* argv[])
117+
{
118+
int i;
119+
printf("test!\n");
120+
if(argc<4){
121+
printf("Input Error\n");
122+
return 1;
123+
}
124+
125+
int m = atoi(argv[1]);
126+
int n = atoi(argv[2]);
127+
int k = atoi(argv[3]);
128+
int sizeofa = m * k;
129+
int sizeofb = k * n;
130+
int sizeofc = m * n;
131+
char ta = 'N';
132+
char tb = 'N';
133+
double alpha = 1.2;
134+
double beta = 0.001;
135+
136+
struct timeval start,finish;
137+
double duration;
138+
139+
double* A = (double*)malloc(sizeof(double) * sizeofa);
140+
double* B = (double*)malloc(sizeof(double) * sizeofb);
141+
double* C = (double*)malloc(sizeof(double) * sizeofc);
142+
143+
srand((unsigned)time(NULL));
144+
145+
for (i=0; i<sizeofa; i++)
146+
A[i] = i%3+1;//(rand()%100)/10.0;
147+
148+
for (i=0; i<sizeofb; i++)
149+
B[i] = i%3+1;//(rand()%100)/10.0;
150+
151+
for (i=0; i<sizeofc; i++)
152+
C[i] = i%3+1;//(rand()%100)/10.0;
153+
//#if 0
154+
printf("m=%d,n=%d,k=%d,alpha=%lf,beta=%lf,sizeofc=%d\n",m,n,k,alpha,beta,sizeofc);
155+
gettimeofday(&start, NULL);
156+
dgemm_(&ta, &tb, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m);
157+
gettimeofday(&finish, NULL);
158+
159+
duration = ((double)(finish.tv_sec-start.tv_sec)*1000000 + (double)(finish.tv_usec-start.tv_usec)) / 1000000;
160+
double gflops = 2.0 * m *n*k;
161+
gflops = gflops/duration*1.0e-6;
162+
163+
FILE *fp;
164+
fp = fopen("timeDGEMM.txt", "a");
165+
fprintf(fp, "%dx%dx%d\t%lf s\t%lf MFLOPS\n", m, n, k, duration, gflops);
166+
fclose(fp);
167+
168+
free(A);
169+
free(B);
170+
free(C);
171+
return 0;
172+
}
173+
```
174+
175+
` gcc -o time_dgemm time_dgemm.c /your/path/libopenblas.a`
176+
177+
` ./time_dgemm <m> <n> <k> `
178+
179+
## Troubleshooting
180+
* Please read the [FAQ](https://github.com/xianyi/OpenBLAS/wiki/Faq) first.
181+
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
182+
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. Clang 3.0 generates incorrect AVX binary code.
183+
* The number of CPUs/Cores should be less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1.
184+
* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting out the line NO_AFFINITY=1 in Makefile.rule. However, this may cause [a conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
185+
* On Loongson 3A, `make test` may fail because of a pthread_create error (error code EAGAIN). However, the same test case runs fine when launched directly from the shell.
186+
187+
## BLAS reference manual
188+
If you want to understand every BLAS function and definition, please read
189+
[Intel MKL reference manual](https://software.intel.com/sites/products/documentation/doclib/iss/2013/mkl/mklman/GUID-F7ED9FB8-6663-4F44-A62B-61B63C4F0491.htm)
190+
or [netlib.org](http://netlib.org/blas/)
191+
192+
Here are [OpenBLAS extension functions](https://github.com/xianyi/OpenBLAS/wiki/OpenBLAS-Extensions)
193+
194+
## How to reference OpenBLAS.
195+
196+
You can reference our [papers](https://github.com/xianyi/OpenBLAS/wiki/publications).
197+
198+
Alternatively, you can cite the OpenBLAS homepage http://www.openblas.net directly.
199+

benchmark/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2134,7 +2134,7 @@ zgemm3m.$(SUFFIX) : gemm3m.c
21342134
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
21352135

21362136
smallscaling: smallscaling.c ../$(LIBNAME)
2137-
$(CC) $(CFLAGS) -lpthread -fopenmp -lm -o $(@F) $^
2137+
$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm
21382138

21392139
clean ::
21402140
@rm -f *.goto *.mkl *.acml *.atlas *.veclib

benchmark/smallscaling.c

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,28 +23,32 @@ typedef struct {
2323

2424
void * s_create_matrix(int size) {
2525
float * r = malloc(size * sizeof(double));
26-
for(int i = 0; i < size; i++)
26+
int i;
27+
for(i = 0; i < size; i++)
2728
r[i] = 1e3 * i / size;
2829
return r;
2930
}
3031

3132
void * c_create_matrix(int size) {
3233
float * r = malloc(size * 2 * sizeof(double));
33-
for(int i = 0; i < 2 * size; i++)
34+
int i;
35+
for(i = 0; i < 2 * size; i++)
3436
r[i] = 1e3 * i / size;
3537
return r;
3638
}
3739

3840
void * z_create_matrix(int size) {
3941
double * r = malloc(size * 2 * sizeof(double));
40-
for(int i = 0; i < 2 * size; i++)
42+
int i;
43+
for(i = 0; i < 2 * size; i++)
4144
r[i] = 1e3 * i / size;
4245
return r;
4346
}
4447

4548
void * d_create_matrix(int size) {
4649
double * r = malloc(size * sizeof(double));
47-
for(int i = 0; i < size; i++)
50+
int i;
51+
for(i = 0; i < size; i++)
4852
r[i] = 1e3 * i / size;
4953
return r;
5054
}
@@ -188,4 +192,5 @@ int main(int argc, char * argv[]) {
188192
size *= inc_factor;
189193
}
190194
}
195+
return(0);
191196
}

common.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -332,12 +332,13 @@ typedef int blasint;
332332
#endif
333333
#endif
334334

335-
335+
/*
336336
#ifdef PILEDRIVER
337337
#ifndef YIELDING
338338
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
339339
#endif
340340
#endif
341+
*/
341342

342343
/*
343344
#ifdef STEAMROLLER

common_power.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
236236
#define HAVE_PREFETCH
237237
#endif
238238

239-
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL)
239+
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8)
240240
#define DCBT_ARG 0
241241
#else
242242
#define DCBT_ARG 8
@@ -258,6 +258,13 @@ static inline int blas_quickdivide(blasint x, blasint y){
258258
#define L1_PREFETCH dcbtst
259259
#endif
260260

261+
#if defined(POWER8)
262+
#define L1_DUALFETCH
263+
#define L1_PREFETCHSIZE (16 + 128 * 100)
264+
#define L1_PREFETCH dcbtst
265+
#endif
266+
267+
#
261268
#ifndef L1_PREFETCH
262269
#define L1_PREFETCH dcbt
263270
#endif
@@ -790,6 +797,8 @@ Lmcount$lazy_ptr:
790797
#define BUFFER_SIZE ( 2 << 20)
791798
#elif defined(PPC440FP2)
792799
#define BUFFER_SIZE ( 16 << 20)
800+
#elif defined(POWER8)
801+
#define BUFFER_SIZE ( 64 << 20)
793802
#else
794803
#define BUFFER_SIZE ( 16 << 20)
795804
#endif

common_x86_64.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -396,7 +396,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
396396

397397
#define PROFCODE
398398

399-
#define EPILOGUE .end REALNAME
399+
#define EPILOGUE .end
400400
#endif
401401

402402
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI)

cpuid_power.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
#define CPUTYPE_POWER6 5
5656
#define CPUTYPE_CELL 6
5757
#define CPUTYPE_PPCG4 7
58+
#define CPUTYPE_POWER8 8
5859

5960
char *cpuname[] = {
6061
"UNKNOWN",
@@ -65,6 +66,7 @@ char *cpuname[] = {
6566
"POWER6",
6667
"CELL",
6768
"PPCG4",
69+
"POWER8"
6870
};
6971

7072
char *lowercpuname[] = {
@@ -76,6 +78,7 @@ char *lowercpuname[] = {
7678
"power6",
7779
"cell",
7880
"ppcg4",
81+
"power8"
7982
};
8083

8184
char *corename[] = {
@@ -87,6 +90,7 @@ char *corename[] = {
8790
"POWER6",
8891
"CELL",
8992
"PPCG4",
93+
"POWER8"
9094
};
9195

9296
int detect(void){
@@ -115,7 +119,7 @@ int detect(void){
115119
if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5;
116120
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
117121
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
118-
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER6;
122+
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
119123
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
120124
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
121125

0 commit comments

Comments
 (0)