Skip to content

Commit e94e63b

Browse files
committed
+/ +/\ +/\. for INT[24]; disable +/@:*"1 on j64
1 parent aa17070 commit e94e63b

9 files changed

Lines changed: 38 additions & 8 deletions

File tree

jsrc/ap.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,10 @@ AHDRP(fn,D,D){I i; \
347347
PREFIXNAN( pluspfxZ, Z, Z, zplus, plusZZ )
348348
PREFIXPFX( pluspfxX, X, X, xplus, plusXX ,HDR1JERR; )
349349
PREFIXPFX( pluspfxQ, Q, Q, qplus, plusQQ ,HDR1JERR; )
350+
static OP1XYZ(plus,I,I,I2,pfxplus)
351+
PREFIXPFX( pluspfxI2, I, I2, pfxplus, plus1II2I , R EVOK; )
352+
static OP1XYZ(plus,I,I,I4,pfxplus)
353+
PREFIXPFX( pluspfxI4, I, I4, pfxplus, plus1II4I , R EVOK; )
350354

351355
PREFIXPFX(tymespfxD, D, D, TYMES, tymesDD ,R EVOK; )
352356
PREFIXPFX(tymespfxZ, Z, Z, ztymes, tymesZZ ,R EVOK;)

jsrc/ar.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,7 @@ REDUCCPFX(tymesinsO, D, I, TYMESO)
262262

263263
AHDRR(plusinsD,D,D){I i;D* RESTRICT y;
264264
NAN0;
265-
// latency of add is 4, so use 8 accumulators for 2 reads per cycle
265+
// latency of add is 2, so use 4 accumulators for 2 reads per cycle
266266
if(d==1){
267267
#if C_AVX2 || EMU_AVX2
268268
redprim256rk1(_mm256_add_pd,dzero)
@@ -306,6 +306,14 @@ AHDRR(plusinsD,D,D){I i;D* RESTRICT y;
306306
REDUCENAN( plusinsZ, Z, Z, zplus, plusZZ )
307307
REDUCEPFX( plusinsX, X, X, xplus, plusXX, plusXX )
308308

309+
static OP1XYZ(plus,I,I2,I2,pfxplus)
310+
OP1XYZ(plus,I,I2,I,pfxplus)
311+
REDUCEPFX( plusinsI2, I, I2, pfxplus, plus1I2I2I, plus1I2II )
312+
static OP1XYZ(plus,I,I4,I4,pfxplus)
313+
OP1XYZ(plus,I,I4,I,pfxplus)
314+
REDUCEPFX( plusinsI4, I, I4, pfxplus, plus1I4I4I, plus1I4II )
315+
316+
309317
#if C_AVX2 || EMU_AVX2
310318
// version for QP with AVX2. Bandwidth is not an issue, so we accumulate into memory for rank > 1
311319
I plusinsE(I d,I n,I m,E* RESTRICTI x,E* RESTRICTI z,J jt){I i; // m is # cells to operate on; n is # items in 1 such cell; d is # atoms in one such item

jsrc/as.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,8 @@ AHDRS(plussfxD,D,D){I i;
135135
SUFFIXNAN( plussfxZ, Z, Z, zplus, plusZZ )
136136
SUFFIXPFX( plussfxX, X, X, xplus, plusXX, HDR1JERR; )
137137
SUFFIXPFX( plussfxQ, Q, Q, qplus, plusQQ, HDR1JERR; )
138+
SUFFIXPFX( plussfxI2, I, I2, pfxplus, plus1I2II, R EVOK; )
139+
SUFFIXPFX( plussfxI4, I, I4, pfxplus, plus1I4II, R EVOK; )
138140

139141
SUFFIXPFX(minussfxB, I, B, MINUS, minusBI, R EVOK; )
140142
SUFFIXNAN(minussfxD, D, D, MINUS, minusDD )

jsrc/j.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1647,6 +1647,8 @@ if(likely(!((I)jtinplace&JTWILLBEOPENED)))z=EPILOGNORET(z); RETF(z); \
16471647
#define NAN1 {if(unlikely(_SW_INVALID&_clearfp())){jsignal(EVNAN); R 0;}}
16481648
#define NAN1V {if(unlikely(_SW_INVALID&_clearfp())){jsignal(EVNAN); R ;}}
16491649
#define NANTEST (_SW_INVALID&_clearfp())
1650+
// OP1XYZ(name,Tz,Tx,Ty,op) emits the definition of an elementwise binary routine.
//  - The generated function is called name##1##Tx##Ty##Tz, i.e. the type suffixes appear in the order x, y, z
//    (for example OP1XYZ(plus,I,I2,I,...) produces plus1I2II).
//  - x and y are the two argument vectors (element types Tx and Ty); z is the result vector (element type Tz).
//  - For each index i supplied by DO(d,...), z[i] is set to op(x[i],y[i]); presumably DO runs i over 0..d-1 - confirm against the DO macro.
//  - The parameters 'one' and 'jt' are not referenced by the generated body.
//  - The routine always returns EVOK.
// Any storage class (e.g. static) is written by the caller in front of the macro invocation.
#define OP1XYZ(name,Tz,Tx,Ty,op) \
 I name##1##Tx##Ty##Tz(I one, I d, Tx *x, Ty *y, Tz *z, J jt){ \
  DO(d, z[i]=op(x[i],y[i]);) \
  R EVOK; \
 }
1651+
16501652
// for debug only
16511653
// #define NAN1 {if(_SW_INVALID&_clearfp()){fprintf(stderr,"nan error: file %s line %d\n",__FILE__,__LINE__);jsignal(EVNAN); R 0;}}
16521654
// #define NAN1V {if(_SW_INVALID&_clearfp()){fprintf(stderr,"nan error: file %s line %d\n",__FILE__,__LINE__);jsignal(EVNAN); R ;}}

jsrc/ja.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -837,6 +837,7 @@ extern void jfree4gmp(void*,size_t);
837837
#define pdtspvv(x,y) jtpdtspvv(jt,(x),(y))
838838
#define pee(a,b,c,d,e ) jtpee(jt,(a),(b),(c),(d),(e))
839839
#define pfill(x,y) jtpfill(jt,(x),(y))
840+
#define pfxplus(x,y) ((x)+(y))  // plain C addition of the two operands; passed as the combining operation to the INT2/INT4 plus reduce/prefix/suffix routine generators
840841
#define piev(x,y) jtpiev(jt,(x),(y))
841842
#define pind(x,y) jtpind(jt,(x),(y))
842843
#define pinit() jtpinit(jt)

jsrc/je.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -986,6 +986,8 @@ extern I level(J,A);
986986
extern I levelle(J,A,I);
987987
extern void mvc(I,void*,I,void*);
988988
extern B nameless(A);
989+
extern I plus1I2II(I,I,I2*,I*,I*,J);
990+
extern I plus1I4II(I,I,I4*,I*,I*,J);
989991
extern D qpf(void);
990992
extern A relocate(I,A);
991993
extern I remii(I,I);

jsrc/va2.c

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
// reduce/prefix/suffix routines
1111
// first word is the maximum valid precision bit index, followed by that many+1 routines for reduce, and then for prefix and suffix.
12+
// routines are in bit-index order
1213
// the last routine is always 0 to indicate invalid
1314
// if there are integer-overflow routines, they come after the others, in the order rps
1415
VARPSA rpsnull = {0, {0}};
@@ -131,9 +132,9 @@ static VARPSA rpsminus = {RATX+1 , {
131132
{(VARPSF)minusinsO,VCVTIP+VD},{(VARPSF)minuspfxO,VCVTIP+VD},{(VARPSF)minussfxO,VCVTIP+VD}, // integer-overflow routines
132133
}};
133134
static VARPSA rpsplus = {QPX+1 , {
134-
{(VARPSF)plusinsB,VCVTIP+VI}, {0}, {(VARPSF)plusinsI,VCVTIP+VI}, {(VARPSF)plusinsD,VCVTIP+VD}, {(VARPSF)plusinsZ,VCVTIP+VZ}, {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0}, {SY_64?(VARPSF)plusinsE:0,VCVTIP+VUNCH},
135-
{(VARPSF)pluspfxB,VCVTIP+VI}, {0}, {(VARPSF)pluspfxI,VCVTIP+VI}, {(VARPSF)pluspfxD,VCVTIP+VD+VIPOKW}, {(VARPSF)pluspfxZ,VCVTIP+VZ}, {0}, {(VARPSF)pluspfxX,VCVTIP+VX}, {(VARPSF)pluspfxQ,VCVTIP+VQ}, {0}, {0}, {0}, {0}, {0}, {0},
136-
{(VARPSF)plussfxB,VCVTIP+VI}, {0}, {(VARPSF)plussfxI,VCVTIP+VI}, {(VARPSF)plussfxD,VCVTIP+VD+VIPOKW}, {(VARPSF)plussfxZ,VCVTIP+VZ}, {0}, {(VARPSF)plussfxX,VCVTIP+VX}, {(VARPSF)plussfxQ,VCVTIP+VQ}, {0}, {0}, {0}, {0}, {0}, {0},
135+
{(VARPSF)plusinsB,VCVTIP+VI}, {0}, {(VARPSF)plusinsI,VCVTIP+VI}, {(VARPSF)plusinsD,VCVTIP+VD}, {(VARPSF)plusinsZ,VCVTIP+VZ}, {0}, {0}, {0}, {0}, {(VARPSF)plusinsI2,VCVTIP+VI}, {(VARPSF)plusinsI4,VCVTIP+VI}, {0}, {0}, {SY_64?(VARPSF)plusinsE:0,VCVTIP+VUNCH},
136+
{(VARPSF)pluspfxB,VCVTIP+VI}, {0}, {(VARPSF)pluspfxI,VCVTIP+VI}, {(VARPSF)pluspfxD,VCVTIP+VD+VIPOKW}, {(VARPSF)pluspfxZ,VCVTIP+VZ}, {0}, {(VARPSF)pluspfxX,VCVTIP+VX}, {(VARPSF)pluspfxQ,VCVTIP+VQ}, {0}, {(VARPSF)pluspfxI2,VCVTIP+VI}, {(VARPSF)pluspfxI4,VCVTIP+VI}, {0}, {0}, {0},
137+
{(VARPSF)plussfxB,VCVTIP+VI}, {0}, {(VARPSF)plussfxI,VCVTIP+VI}, {(VARPSF)plussfxD,VCVTIP+VD+VIPOKW}, {(VARPSF)plussfxZ,VCVTIP+VZ}, {0}, {(VARPSF)plussfxX,VCVTIP+VX}, {(VARPSF)plussfxQ,VCVTIP+VQ}, {0}, {(VARPSF)plussfxI2,VCVTIP+VI}, {(VARPSF)plussfxI4,VCVTIP+VI}, {0}, {0}, {0},
137138
{(VARPSF)plusinsO,VCVTIP+VD},{(VARPSF)pluspfxO,VCVTIP+VD},{(VARPSF)plussfxO,VCVTIP+VD}, // integer-overflow routines
138139
}};
139140
static VARPSA rpstymes = {RATX+1 , {
@@ -976,7 +977,7 @@ I jtsumattymesprods(J jt,I it,void *avp, void *wvp,I dplen,I nfro,I nfri,I ndpo,
976977
R 1;
977978
}
978979

979-
#if C_AVX2 || EMU_AVX2
980+
#if (C_AVX2 || EMU_AVX2) & HASFMA
980981
// +/@:*"1 for QP, with IRS by hand
981982
static DF2(jtsumattymes1E){
982983
if(unlikely((I)((1-AR(a))|(1-AR(w)))<0)){I lr=MIN((RANKT)jt->ranks,AR(a)); I rr=MIN(jt->ranks>>RANKTX,AR(w)); R rank2ex(a,w,(A)self,1,1,lr,rr,jtsumattymes1E);} // if multiple results needed, do rank loop
@@ -1027,7 +1028,7 @@ DF2(jtsumattymes1){
10271028
// if an argument is empty, sparse, or not a fast arithmetic type, or only one arg has rank 0, revert to the code for f/@:g atomic
10281029
if(((-((AT(a)|AT(w))&((NOUN|SPARSE)&~(B01|INT|FL))))|(AN(a)-1)|(AN(w)-1)|((acr-1)^(wcr-1)))<0) { // test for all unusual cases
10291030
ASSERT(fit!=2,EVNONCE) // user expected 2 atoms per result, but we don't support that for repeated atomic arg
1030-
#if C_AVX2 || EMU_AVX2 // high-perf QP only on 64-bit
1031+
#if (C_AVX2 || EMU_AVX2) & HASFMA // high-perf QP only on 64-bit
10311032
if(ISDENSETYPE(AT(a)|AT(w),QP)&&((AN(a)-1)|(AN(w)-1)|(acr-1)|(wcr-1))>=0){
10321033
// QP dot-product. Transfer to that code with rank still set
10331034
if(unlikely(!(AT(a)&QP)))RZ(a=cvt(QP,a)) else if(unlikely(!(AT(w)&QP)))RZ(w=cvt(QP,w)) // convert lower arg to qp

jsrc/ve.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,9 @@ extern ADECLP( pluspfxO,D, I ); extern ADECLR( plusinsO,D, I ); extern ADECL
8080
extern ADECLP( pluspfxQ,Q, Q ); extern ADECLS( plussfxQ,Q, Q );
8181
extern ADECLP( pluspfxX,X, X ); extern ADECLS( plussfxX,X, X );
8282
extern ADECLP( pluspfxZ,Z, Z ); extern ADECLR( plusinsZ,Z, Z ); extern ADECLS( plussfxZ,Z, Z );
83-
extern ADECLR( plusinsE,E, E );
83+
extern ADECLR( plusinsE,E, E ); extern ADECLR( plusinsI2,I, I2 ); extern ADECLR( plusinsI4,I, I4 );
84+
extern ADECLP( pluspfxI2,I, I2 ); extern ADECLP( pluspfxI4,I, I4 );  // prefix (+/\) routines for INT2/INT4 with INT result; declared with ADECLP like the other pluspfx* routines
85+
extern ADECLS( plussfxI2,I, I2 ); extern ADECLS( plussfxI4,I, I4 );  // suffix (+/\.) routines for INT2/INT4 with INT result; declared with ADECLS like the other plussfx* routines
8486
extern ADECLP( tymespfxD,D, D ); extern ADECLR( tymesinsD,D, D ); extern ADECLS( tymessfxD,D, D );
8587
extern ADECLP( tymespfxI,I, I ); extern ADECLR( tymesinsI,I, I ); extern ADECLS( tymessfxI,I, I );
8688
extern ADECLP( tymespfxO,D, I ); extern ADECLR( tymesinsO,D, I ); extern ADECLS( tymessfxO,D, I );

test/gj.ijs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,15 @@ NB. int2 and int4
290290

291291
100100 = (6 c. 100) + (7 c. 100000)
292292

293-
293+
{{
294+
xx =: y ?@$ 1000
295+
assert. ((-: *. 1&=@:#@] +. -:&(3!:0)) &:(+/) 6&c.) xx
296+
assert. ((-: *. 1&=@:#@] +. -:&(3!:0))&:(+/) 7&c.) xx
297+
assert. ((-: *. 1&>:@:#@] +. -:&(3!:0))&:(+/\) 6&c.) xx
298+
assert. ((-: *. 1&>:@:#@] +. -:&(3!:0))&:(+/\) 7&c.) xx
299+
assert. ((-: *. 1&>:@:#@] +. -:&(3!:0))&:(+/\.) 6&c.) xx
300+
assert. ((-: *. 1&>:@:#@] +. -:&(3!:0))&:(+/\.) 7&c.) xx
301+
1 }} &> (<"0 i. 50) , <"1 (2 6 ?@$ 30)
294302

295303

296304
4!:55 ;:'a argrand argnear b c carg d f f2 fsmall jdot p t xd yd dx dy xx xy qx qy s xs ys'

0 commit comments

Comments
 (0)