+/ +/\ +/\. for INT[24]; disable +/@:*"1 on j64

HenryHRich · HenryHRich · commit e94e63b53658 · 2024-02-01T14:45:30.000-05:00
diff --git a/jsrc/ap.c b/jsrc/ap.c
@@ -347,6 +347,10 @@ AHDRP(fn,D,D){I i; \
 PREFIXNAN( pluspfxZ, Z, Z,  zplus, plusZZ  )
 PREFIXPFX( pluspfxX, X, X,  xplus, plusXX ,HDR1JERR; )
 PREFIXPFX( pluspfxQ, Q, Q,  qplus, plusQQ ,HDR1JERR; )
+static OP1XYZ(plus,I,I,I2,pfxplus)  // defines file-local plus1II2I: z[i]=x[i]+y[i] with x and z of type I, y of type I2
+PREFIXPFX( pluspfxI2, I, I2,  pfxplus, plus1II2I , R EVOK; )  // pluspfxI2: presumably +/\ taking I2 data and producing I; uses plus1II2I from the line above
+static OP1XYZ(plus,I,I,I4,pfxplus)  // defines file-local plus1II4I: z[i]=x[i]+y[i] with x and z of type I, y of type I4
+PREFIXPFX( pluspfxI4, I, I4,  pfxplus, plus1II4I , R EVOK; )  // pluspfxI4: presumably +/\ taking I4 data and producing I; uses plus1II4I from the line above
 
 PREFIXPFX(tymespfxD, D, D,  TYMES, tymesDD ,R EVOK;  )
 PREFIXPFX(tymespfxZ, Z, Z,  ztymes, tymesZZ ,R EVOK;)
diff --git a/jsrc/ar.c b/jsrc/ar.c
@@ -262,7 +262,7 @@ REDUCCPFX(tymesinsO, D, I, TYMESO)
 
 AHDRR(plusinsD,D,D){I i;D* RESTRICT y;
   NAN0;
-  // latency of add is 4, so use 8 accumulators for 2 reads per cycle
+  // latency of add is 2, so use 4 accumulators for 2 reads per cycle
   if(d==1){
 #if C_AVX2 || EMU_AVX2
    redprim256rk1(_mm256_add_pd,dzero)
@@ -306,6 +306,14 @@ AHDRR(plusinsD,D,D){I i;D* RESTRICT y;
 REDUCENAN( plusinsZ, Z, Z, zplus, plusZZ )
 REDUCEPFX( plusinsX, X, X, xplus, plusXX, plusXX )
 
+static OP1XYZ(plus,I,I2,I2,pfxplus)  // defines file-local plus1I2I2I: z[i]=x[i]+y[i] with x,y of type I2 and z of type I
+OP1XYZ(plus,I,I2,I,pfxplus)  // defines plus1I2II (x is I2, y and z are I); not static because as.c also uses it via the extern in je.h
+REDUCEPFX( plusinsI2, I, I2, pfxplus, plus1I2I2I, plus1I2II )
+static OP1XYZ(plus,I,I4,I4,pfxplus)  // defines file-local plus1I4I4I: z[i]=x[i]+y[i] with x,y of type I4 and z of type I
+OP1XYZ(plus,I,I4,I,pfxplus)  // defines plus1I4II (x is I4, y and z are I); not static because as.c also uses it via the extern in je.h
+REDUCEPFX( plusinsI4, I, I4, pfxplus, plus1I4I4I, plus1I4II )
+
+
 #if C_AVX2 || EMU_AVX2
 // version for QP with AVX2.  Bandwidth is not an issue, so we accumulate into memory for rank > 1
 I plusinsE(I d,I n,I m,E* RESTRICTI x,E* RESTRICTI z,J jt){I i;  // m is # cells to operate on; n is # items in 1 such cell; d is # atoms in one such item
diff --git a/jsrc/as.c b/jsrc/as.c
@@ -135,6 +135,8 @@ AHDRS(plussfxD,D,D){I i;
 SUFFIXNAN( plussfxZ, Z, Z, zplus, plusZZ )
 SUFFIXPFX( plussfxX, X, X, xplus, plusXX, HDR1JERR; )
 SUFFIXPFX( plussfxQ, Q, Q, qplus, plusQQ, HDR1JERR; )
+SUFFIXPFX( plussfxI2, I, I2, pfxplus, plus1I2II, R EVOK; )  // plus1I2II is defined in ar.c and declared in je.h
+SUFFIXPFX( plussfxI4, I, I4, pfxplus, plus1I4II, R EVOK; )  // plus1I4II is defined in ar.c and declared in je.h
 
 SUFFIXPFX(minussfxB, I, B, MINUS, minusBI, R EVOK; )
 SUFFIXNAN(minussfxD, D, D, MINUS, minusDD )
diff --git a/jsrc/j.h b/jsrc/j.h
@@ -1647,6 +1647,8 @@ if(likely(!((I)jtinplace&JTWILLBEOPENED)))z=EPILOGNORET(z); RETF(z); \
 #define NAN1            {if(unlikely(_SW_INVALID&_clearfp())){jsignal(EVNAN); R 0;}}
 #define NAN1V           {if(unlikely(_SW_INVALID&_clearfp())){jsignal(EVNAN); R  ;}}
 #define NANTEST         (_SW_INVALID&_clearfp())
+#define OP1XYZ(name,Tz,Tx,Ty,pfx) I name##1##Tx##Ty##Tz(I one, I d, Tx *x, Ty *y, Tz *z, J jt){DO(d, z[i]=pfx(x[i],y[i]);) R EVOK;}  // defines function <name>1<Tx><Ty><Tz>: stores z[i]=pfx(x[i],y[i]) under DO(d,...) and always returns EVOK; the 'one' parameter is not referenced in the body
+
 // for debug only
 // #define NAN1            {if(_SW_INVALID&_clearfp()){fprintf(stderr,"nan error: file %s line %d\n",__FILE__,__LINE__);jsignal(EVNAN); R 0;}}
 // #define NAN1V           {if(_SW_INVALID&_clearfp()){fprintf(stderr,"nan error: file %s line %d\n",__FILE__,__LINE__);jsignal(EVNAN); R  ;}}
diff --git a/jsrc/ja.h b/jsrc/ja.h
@@ -837,6 +837,7 @@ extern void jfree4gmp(void*,size_t);
 #define pdtspvv(x,y)                jtpdtspvv(jt,(x),(y))
 #define pee(a,b,c,d,e    )          jtpee(jt,(a),(b),(c),(d),(e))
 #define pfill(x,y)                  jtpfill(jt,(x),(y))
+#define pfxplus(x,y)                ((x)+(y))  // plain C addition with no overflow check; passed as the element operation to the I2/I4 plus reduce/prefix/suffix routines
 #define piev(x,y)                   jtpiev(jt,(x),(y))
 #define pind(x,y)                   jtpind(jt,(x),(y))
 #define pinit()                     jtpinit(jt)
diff --git a/jsrc/je.h b/jsrc/je.h
@@ -986,6 +986,8 @@ extern I        level(J,A);
 extern I        levelle(J,A,I);
 extern void     mvc(I,void*,I,void*);
 extern B        nameless(A);
+extern I        plus1I2II(I,I,I2*,I*,I*,J);  // generated by OP1XYZ(plus,I,I2,I,pfxplus) in ar.c
+extern I        plus1I4II(I,I,I4*,I*,I*,J);  // generated by OP1XYZ(plus,I,I4,I,pfxplus) in ar.c
 extern D        qpf(void);
 extern A        relocate(I,A);
 extern I        remii(I,I);
diff --git a/jsrc/va2.c b/jsrc/va2.c
@@ -9,6 +9,7 @@
 
 // reduce/prefix/suffix routines
 // first word is the maximum valid precision bit index, followed by that many+1 routines for reduce, and then for prefix and suffix.
+// routines are in bit-index order
 // the last routine is always 0 to indicate invalid
 // if there are integer-overflow routine, they comes after the others, in the order rps
 VARPSA rpsnull = {0, {0}};
@@ -131,9 +132,9 @@ static VARPSA rpsminus = {RATX+1 , {
 {(VARPSF)minusinsO,VCVTIP+VD},{(VARPSF)minuspfxO,VCVTIP+VD},{(VARPSF)minussfxO,VCVTIP+VD},  // integer-overflow routines
 }};
 static VARPSA rpsplus = {QPX+1 , {
-{(VARPSF)plusinsB,VCVTIP+VI}, {0}, {(VARPSF)plusinsI,VCVTIP+VI}, {(VARPSF)plusinsD,VCVTIP+VD}, {(VARPSF)plusinsZ,VCVTIP+VZ},        {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0}, {SY_64?(VARPSF)plusinsE:0,VCVTIP+VUNCH},
-{(VARPSF)pluspfxB,VCVTIP+VI}, {0}, {(VARPSF)pluspfxI,VCVTIP+VI}, {(VARPSF)pluspfxD,VCVTIP+VD+VIPOKW}, {(VARPSF)pluspfxZ,VCVTIP+VZ}, {0}, {(VARPSF)pluspfxX,VCVTIP+VX}, {(VARPSF)pluspfxQ,VCVTIP+VQ}, {0}, {0}, {0}, {0}, {0}, {0},
-{(VARPSF)plussfxB,VCVTIP+VI}, {0}, {(VARPSF)plussfxI,VCVTIP+VI}, {(VARPSF)plussfxD,VCVTIP+VD+VIPOKW}, {(VARPSF)plussfxZ,VCVTIP+VZ}, {0}, {(VARPSF)plussfxX,VCVTIP+VX}, {(VARPSF)plussfxQ,VCVTIP+VQ}, {0}, {0}, {0}, {0}, {0}, {0},
+{(VARPSF)plusinsB,VCVTIP+VI}, {0}, {(VARPSF)plusinsI,VCVTIP+VI}, {(VARPSF)plusinsD,VCVTIP+VD}, {(VARPSF)plusinsZ,VCVTIP+VZ},        {0}, {0}, {0}, {0}, {(VARPSF)plusinsI2,VCVTIP+VI}, {(VARPSF)plusinsI4,VCVTIP+VI}, {0}, {0}, {SY_64?(VARPSF)plusinsE:0,VCVTIP+VUNCH},
+{(VARPSF)pluspfxB,VCVTIP+VI}, {0}, {(VARPSF)pluspfxI,VCVTIP+VI}, {(VARPSF)pluspfxD,VCVTIP+VD+VIPOKW}, {(VARPSF)pluspfxZ,VCVTIP+VZ}, {0}, {(VARPSF)pluspfxX,VCVTIP+VX}, {(VARPSF)pluspfxQ,VCVTIP+VQ}, {0}, {(VARPSF)pluspfxI2,VCVTIP+VI}, {(VARPSF)pluspfxI4,VCVTIP+VI}, {0}, {0}, {0},
+{(VARPSF)plussfxB,VCVTIP+VI}, {0}, {(VARPSF)plussfxI,VCVTIP+VI}, {(VARPSF)plussfxD,VCVTIP+VD+VIPOKW}, {(VARPSF)plussfxZ,VCVTIP+VZ}, {0}, {(VARPSF)plussfxX,VCVTIP+VX}, {(VARPSF)plussfxQ,VCVTIP+VQ}, {0}, {(VARPSF)plussfxI2,VCVTIP+VI}, {(VARPSF)plussfxI4,VCVTIP+VI}, {0}, {0}, {0},
 {(VARPSF)plusinsO,VCVTIP+VD},{(VARPSF)pluspfxO,VCVTIP+VD},{(VARPSF)plussfxO,VCVTIP+VD},  // integer-overflow routines
 }};
 static VARPSA rpstymes = {RATX+1 , {
@@ -976,7 +977,7 @@ I jtsumattymesprods(J jt,I it,void *avp, void *wvp,I dplen,I nfro,I nfri,I ndpo,
  R 1;
 }
 
-#if C_AVX2 || EMU_AVX2
+#if (C_AVX2 || EMU_AVX2) & HASFMA
 // +/@:*"1 for QP, with IRS by hand
 static DF2(jtsumattymes1E){
  if(unlikely((I)((1-AR(a))|(1-AR(w)))<0)){I lr=MIN((RANKT)jt->ranks,AR(a)); I rr=MIN(jt->ranks>>RANKTX,AR(w)); R rank2ex(a,w,(A)self,1,1,lr,rr,jtsumattymes1E);}  // if multiple results needed, do rank loop
@@ -1027,7 +1028,7 @@ DF2(jtsumattymes1){
  // if an argument is empty, sparse, or not a fast arithmetic type, or only one arg has rank 0, revert to the code for f/@:g atomic
  if(((-((AT(a)|AT(w))&((NOUN|SPARSE)&~(B01|INT|FL))))|(AN(a)-1)|(AN(w)-1)|((acr-1)^(wcr-1)))<0) { // test for all unusual cases
   ASSERT(fit!=2,EVNONCE)  // user expected 2 atoms per result, but we don't support that for repeated atomic arg
-#if C_AVX2 || EMU_AVX2   // high-perf QP only on 64-bit
+#if (C_AVX2 || EMU_AVX2) & HASFMA   // high-perf QP only on 64-bit, and only when HASFMA is set; must match the guard on jtsumattymes1E
   if(ISDENSETYPE(AT(a)|AT(w),QP)&&((AN(a)-1)|(AN(w)-1)|(acr-1)|(wcr-1))>=0){
    // QP dot-product.  Transfer to that code with rank still set
    if(unlikely(!(AT(a)&QP)))RZ(a=cvt(QP,a)) else if(unlikely(!(AT(w)&QP)))RZ(w=cvt(QP,w))  // convert lower arg to qp
diff --git a/jsrc/ve.h b/jsrc/ve.h
@@ -80,7 +80,9 @@ extern ADECLP(  pluspfxO,D, I );  extern ADECLR(  plusinsO,D, I );  extern ADECL
 extern ADECLP(  pluspfxQ,Q, Q );                                    extern ADECLS(  plussfxQ,Q, Q );
 extern ADECLP(  pluspfxX,X, X );                                    extern ADECLS(  plussfxX,X, X );
 extern ADECLP(  pluspfxZ,Z, Z );  extern ADECLR(  plusinsZ,Z, Z );  extern ADECLS(  plussfxZ,Z, Z ); 
-extern ADECLR(  plusinsE,E, E );  
+extern ADECLR(  plusinsE,E, E );  extern ADECLR(  plusinsI2,I, I2 );   extern ADECLR(  plusinsI4,I, I4 );  // reduce (+/) routines
+extern ADECLP(  pluspfxI2,I, I2 );   extern ADECLP(  pluspfxI4,I, I4 );  // prefix (+/\) routines use ADECLP like the other pluspfx declarations
+extern ADECLS(  plussfxI2,I, I2 );   extern ADECLS(  plussfxI4,I, I4 );  // suffix (+/\.) routines use ADECLS like the other plussfx declarations
 extern ADECLP( tymespfxD,D, D );  extern ADECLR( tymesinsD,D, D );  extern ADECLS( tymessfxD,D, D );
 extern ADECLP( tymespfxI,I, I );  extern ADECLR( tymesinsI,I, I );  extern ADECLS( tymessfxI,I, I );
 extern ADECLP( tymespfxO,D, I );  extern ADECLR( tymesinsO,D, I );  extern ADECLS( tymessfxO,D, I );
diff --git a/test/gj.ijs b/test/gj.ijs
@@ -290,7 +290,15 @@ NB. int2 and int4
 
 100100 = (6 c. 100) + (7 c. 100000)
 
-
+{{
+xx =: y ?@$ 1000   NB. random integers in 0..999 with shape y
+assert. ((-: *. 1&=@:#@] +. -:&(3!:0)) &:(+/) 6&c.) xx   NB. (+/ xx) must match (+/ 6 c. xx); datatypes (3!:0) must also agree unless the converted-side result has exactly 1 item
+assert. ((-: *. 1&=@:#@] +. -:&(3!:0))&:(+/) 7&c.) xx
+assert. ((-: *. 1&>:@:#@] +.  -:&(3!:0))&:(+/\) 6&c.) xx   NB. same check for +/\ ; the datatype test is waived when the converted-side result has at most 1 item
+assert. ((-: *. 1&>:@:#@] +. -:&(3!:0))&:(+/\) 7&c.) xx
+assert. ((-: *. 1&>:@:#@] +. -:&(3!:0))&:(+/\.) 6&c.) xx   NB. same check for +/\.
+assert. ((-: *. 1&>:@:#@] +. -:&(3!:0))&:(+/\.) 7&c.) xx
+1 }} &> (<"0 i. 50) , <"1 (2 6 ?@$ 30)   NB. run for list lengths 0..49 and for 2 random 6-axis shapes with every axis length below 30
 
 
 4!:55 ;:'a argrand argnear b c carg d f f2 fsmall jdot p t xd yd dx dy xx xy qx qy s xs ys'