Skip to content

Commit d10db52

Browse files
committed
Merge pull request #390 from wernsaar/develop
Ref #103: enhancement for small matrix dimensions. Fixed some bugs. Enable sgemm for SNB and dgemm for NEHALEM
2 parents 8602816 + dabab2b commit d10db52

30 files changed

Lines changed: 3375 additions & 75 deletions

Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,13 @@ ifndef BINARY64
3636
else
3737
@echo " BINARY ... 64bit "
3838
endif
39+
3940
ifdef INTERFACE64
41+
ifneq ($(INTERFACE64), 0)
4042
@echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) "
4143
endif
44+
endif
45+
4246
@echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
4347
ifndef NOFORTRAN
4448
@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"

Makefile.rule

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,8 @@ NO_AFFINITY = 1
133133
# COMMON_OPT = -O2
134134

135135
# gfortran option for LAPACK
136-
FCOMMON_OPT = -frecursive
136+
# Enable this flag only on 64-bit Linux, and only if you need a thread-safe LAPACK library.
137+
# FCOMMON_OPT = -frecursive
137138

138139
# Profiling flags
139140
COMMON_PROF = -pg

Makefile.system

Lines changed: 65 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,15 +46,55 @@ ifdef TARGET
4646
GETARCH_FLAGS := -DFORCE_$(TARGET)
4747
endif
4848

49+
# Force fallbacks for 32bit
50+
51+
ifeq ($(BINARY), 32)
52+
ifeq ($(TARGET), HASWELL)
53+
GETARCH_FLAGS := -DFORCE_NEHALEM
54+
endif
55+
ifeq ($(TARGET), SANDYBRIDGE)
56+
GETARCH_FLAGS := -DFORCE_NEHALEM
57+
endif
58+
ifeq ($(TARGET), BULLDOZER)
59+
GETARCH_FLAGS := -DFORCE_BARCELONA
60+
endif
61+
ifeq ($(TARGET), PILEDRIVER)
62+
GETARCH_FLAGS := -DFORCE_BARCELONA
63+
endif
64+
endif
65+
66+
4967
#TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
5068
#
5169
ifdef TARGET_CORE
5270
GETARCH_FLAGS := -DFORCE_$(TARGET_CORE)
5371
endif
5472

73+
# Force fallbacks for 32bit
74+
75+
ifeq ($(BINARY), 32)
76+
ifeq ($(TARGET_CORE), HASWELL)
77+
GETARCH_FLAGS := -DFORCE_NEHALEM
78+
endif
79+
ifeq ($(TARGET_CORE), SANDYBRIDGE)
80+
GETARCH_FLAGS := -DFORCE_NEHALEM
81+
endif
82+
ifeq ($(TARGET_CORE), BULLDOZER)
83+
GETARCH_FLAGS := -DFORCE_BARCELONA
84+
endif
85+
ifeq ($(TARGET_CORE), PILEDRIVER)
86+
GETARCH_FLAGS := -DFORCE_BARCELONA
87+
endif
88+
endif
89+
90+
91+
92+
5593
ifdef INTERFACE64
94+
ifneq ($(INTERFACE64), 0)
5695
GETARCH_FLAGS += -DUSE64BITINT
5796
endif
97+
endif
5898

5999
ifndef GEMM_MULTITHREAD_THRESHOLD
60100
GEMM_MULTITHREAD_THRESHOLD=4
@@ -65,6 +105,10 @@ ifeq ($(NO_AVX), 1)
65105
GETARCH_FLAGS += -DNO_AVX
66106
endif
67107

108+
ifeq ($(BINARY), 32)
109+
GETARCH_FLAGS += -DNO_AVX
110+
endif
111+
68112
ifeq ($(DEBUG), 1)
69113
GETARCH_FLAGS += -g
70114
endif
@@ -336,9 +380,6 @@ ifeq ($(DYNAMIC_ARCH), 1)
336380
ifeq ($(ARCH), x86)
337381
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
338382
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
339-
ifneq ($(NO_AVX), 1)
340-
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL
341-
endif
342383
endif
343384

344385
ifeq ($(ARCH), x86_64)
@@ -503,8 +544,10 @@ else
503544
ifdef BINARY64
504545
FCOMMON_OPT += -m64
505546
ifdef INTERFACE64
547+
ifneq ($(INTERFACE64), 0)
506548
FCOMMON_OPT += -fdefault-integer-8
507549
endif
550+
endif
508551
else
509552
FCOMMON_OPT += -m32
510553
endif
@@ -517,8 +560,10 @@ endif
517560
ifeq ($(F_COMPILER), INTEL)
518561
CCOMMON_OPT += -DF_INTERFACE_INTEL
519562
ifdef INTERFACE64
563+
ifneq ($(INTERFACE64), 0)
520564
FCOMMON_OPT += -i8
521565
endif
566+
endif
522567
ifdef USE_OPENMP
523568
FCOMMON_OPT += -openmp
524569
endif
@@ -537,8 +582,10 @@ CCOMMON_OPT += -DF_INTERFACE_IBM
537582
ifdef BINARY64
538583
FCOMMON_OPT += -q64
539584
ifdef INTERFACE64
585+
ifneq ($(INTERFACE64), 0)
540586
FCOMMON_OPT += -qintsize=8
541587
endif
588+
endif
542589
else
543590
FCOMMON_OPT += -q32
544591
endif
@@ -552,8 +599,10 @@ CCOMMON_OPT += -DF_INTERFACE_PGI
552599
COMMON_PROF += -DPGICOMPILER
553600
ifdef BINARY64
554601
ifdef INTERFACE64
602+
ifneq ($(INTERFACE64), 0)
555603
FCOMMON_OPT += -i8
556604
endif
605+
endif
557606
FCOMMON_OPT += -tp p7-64
558607
else
559608
FCOMMON_OPT += -tp p7
@@ -567,9 +616,11 @@ ifeq ($(F_COMPILER), PATHSCALE)
567616
CCOMMON_OPT += -DF_INTERFACE_PATHSCALE
568617
ifdef BINARY64
569618
ifdef INTERFACE64
619+
ifneq ($(INTERFACE64), 0)
570620
FCOMMON_OPT += -i8
571621
endif
572622
endif
623+
endif
573624

574625
ifneq ($(ARCH), mips64)
575626
ifndef BINARY64
@@ -594,9 +645,11 @@ ifeq ($(F_COMPILER), OPEN64)
594645
CCOMMON_OPT += -DF_INTERFACE_OPEN64
595646
ifdef BINARY64
596647
ifdef INTERFACE64
648+
ifneq ($(INTERFACE64), 0)
597649
FCOMMON_OPT += -i8
598650
endif
599651
endif
652+
endif
600653

601654
ifeq ($(ARCH), mips64)
602655
ifndef BINARY64
@@ -682,10 +735,12 @@ endif
682735

683736
ifdef BINARY64
684737
ifdef INTERFACE64
738+
ifneq ($(INTERFACE64), 0)
685739
CCOMMON_OPT +=
686740
#-DUSE64BITINT
687741
endif
688742
endif
743+
endif
689744

690745
ifeq ($(NEED_PIC), 1)
691746
ifeq ($(C_COMPILER), IBM)
@@ -718,6 +773,10 @@ ifeq ($(NO_AVX), 1)
718773
CCOMMON_OPT += -DNO_AVX
719774
endif
720775

776+
ifeq ($(BINARY), 32)
777+
CCOMMON_OPT += -DNO_AVX
778+
endif
779+
721780
ifdef SMP
722781
CCOMMON_OPT += -DSMP_SERVER
723782

@@ -872,8 +931,11 @@ endif
872931
LAPACK_CFLAGS = $(CFLAGS)
873932
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
874933
ifdef INTERFACE64
934+
ifneq ($(INTERFACE64), 0)
875935
LAPACK_CFLAGS += -DLAPACK_ILP64
876936
endif
937+
endif
938+
877939
ifdef OS_WINDOWS
878940
LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS
879941
endif

cpuid_x86.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
#include <string.h>
4141
#include "cpuid.h"
4242

43+
/*
4344
#ifdef NO_AVX
4445
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM
4546
#define CORE_HASWELL CORE_NEHALEM
@@ -50,6 +51,7 @@
5051
#define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA
5152
#define CORE_PILEDRIVER CORE_BARCELONA
5253
#endif
54+
*/
5355

5456
#ifndef CPUIDEMU
5557

driver/others/divtable.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
#include "common.h"
4040

4141
#ifdef SMP
42-
#ifndef USE64BITINT
42+
#if !defined(USE64BITINT) || defined(ARCH_X86)
4343
unsigned int blas_quick_divide_table[] = {
4444
0x00000000, 0x00000001, 0x80000001, 0x55555556,
4545
0x40000001, 0x33333334, 0x2aaaaaab, 0x24924925,

interface/gemm.c

Lines changed: 56 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@
7272
#endif
7373

7474
#ifndef GEMM_MULTITHREAD_THRESHOLD
75-
# define GEMM_MULTITHREAD_THRESHOLD 4
75+
#define GEMM_MULTITHREAD_THRESHOLD 4
7676
#endif
7777

7878
static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
@@ -400,14 +400,63 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
400400
mode |= (transa << BLAS_TRANSA_SHIFT);
401401
mode |= (transb << BLAS_TRANSB_SHIFT);
402402

403-
args.common = NULL;
403+
int nthreads_max = num_cpu_avail(3);
404+
int nthreads_avail = nthreads_max;
404405

405-
if(args.m <= GEMM_MULTITHREAD_THRESHOLD || args.n <= GEMM_MULTITHREAD_THRESHOLD
406-
|| args.k <=GEMM_MULTITHREAD_THRESHOLD){
407-
args.nthreads = 1;
408-
}else{
409-
args.nthreads = num_cpu_avail(3);
406+
#ifndef COMPLEX
407+
double MNK = (double) args.m * (double) args.n * (double) args.k;
408+
if ( MNK <= (1024.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
409+
nthreads_max = 1;
410+
else
411+
{
412+
if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
413+
{
414+
nthreads_max = 4;
415+
if ( args.m < 16 * GEMM_MULTITHREAD_THRESHOLD )
416+
{
417+
nthreads_max = 2;
418+
if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
419+
if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
420+
if ( args.k < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
421+
}
422+
else
423+
{
424+
if ( args.n <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2;
425+
}
426+
}
410427
}
428+
#else
429+
double MNK = (double) args.m * (double) args.n * (double) args.k;
430+
if ( MNK <= (256.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
431+
nthreads_max = 1;
432+
else
433+
{
434+
if ( MNK <= (16384.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
435+
{
436+
nthreads_max = 4;
437+
if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD )
438+
{
439+
nthreads_max = 2;
440+
if ( args.m <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
441+
if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
442+
if ( args.k < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
443+
}
444+
else
445+
{
446+
if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2;
447+
}
448+
}
449+
}
450+
451+
#endif
452+
args.common = NULL;
453+
454+
if ( nthreads_max > nthreads_avail )
455+
args.nthreads = nthreads_avail;
456+
else
457+
args.nthreads = nthreads_max;
458+
459+
411460
if (args.nthreads == 1) {
412461
#endif
413462

interface/ger.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ void NAME(blasint *M, blasint *N, FLOAT *Alpha,
7575
blasint incy = *INCY;
7676
blasint lda = *LDA;
7777
FLOAT *buffer;
78-
#ifdef SMP
78+
#ifdef SMPBUG
7979
int nthreads;
8080
#endif
8181

@@ -107,7 +107,7 @@ void CNAME(enum CBLAS_ORDER order,
107107

108108
FLOAT *buffer;
109109
blasint info, t;
110-
#ifdef SMP
110+
#ifdef SMPBUG
111111
int nthreads;
112112
#endif
113113

@@ -167,15 +167,16 @@ void CNAME(enum CBLAS_ORDER order,
167167

168168
buffer = (FLOAT *)blas_memory_alloc(1);
169169

170-
#ifdef SMP
170+
#ifdef SMPBUG
171171
nthreads = num_cpu_avail(2);
172172

173+
173174
if (nthreads == 1) {
174175
#endif
175176

176177
GER(m, n, 0, alpha, x, incx, y, incy, a, lda, buffer);
177178

178-
#ifdef SMP
179+
#ifdef SMPBUG
179180
} else {
180181

181182
GER_THREAD(m, n, alpha, x, incx, y, incy, a, lda, buffer, nthreads);

interface/rotmg.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
6262

6363
#endif
6464

65-
FLOAT du, dp1, dp2, dq2, dq1, dh11, dh21, dh12, dh22, dflag, dtemp;
65+
FLOAT du, dp1, dp2, dq2, dq1, dh11=ZERO, dh21=ZERO, dh12=ZERO, dh22=ZERO, dflag=-ONE, dtemp;
6666

6767
if(*dd1 < ZERO)
6868
{

interface/zger.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ void NAME(blasint *M, blasint *N, FLOAT *Alpha,
109109
blasint incy = *INCY;
110110
blasint lda = *LDA;
111111
FLOAT *buffer;
112-
#ifdef SMP
112+
#ifdef SMPBUG
113113
int nthreads;
114114
#endif
115115

@@ -144,7 +144,7 @@ void CNAME(enum CBLAS_ORDER order,
144144

145145
FLOAT *buffer;
146146
blasint info, t;
147-
#ifdef SMP
147+
#ifdef SMPBUG
148148
int nthreads;
149149
#endif
150150

@@ -205,7 +205,7 @@ void CNAME(enum CBLAS_ORDER order,
205205

206206
buffer = (FLOAT *)blas_memory_alloc(1);
207207

208-
#ifdef SMP
208+
#ifdef SMPBUG
209209
nthreads = num_cpu_avail(2);
210210

211211
if (nthreads == 1) {
@@ -221,7 +221,7 @@ void CNAME(enum CBLAS_ORDER order,
221221
}
222222
#endif
223223

224-
#ifdef SMP
224+
#ifdef SMPBUG
225225

226226
} else {
227227

kernel/x86_64/KERNEL.BARCELONA

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
SGEMVNKERNEL = sgemv_n.S
2+
SGEMVTKERNEL = sgemv_t.S
3+
14
ZGEMVNKERNEL = zgemv_n_dup.S
25
ZGEMVTKERNEL = zgemv_t_dup.S
36

0 commit comments

Comments
 (0)