|
9 | 9 |
|
10 | 10 | // reduce/prefix/suffix routines |
11 | 11 | // first word is the maximum valid precision bit index, followed by that many+1 routines for reduce, and then for prefix and suffix. |
| 12 | +// routines are in bit-index order |
12 | 13 | // the last routine is always 0 to indicate invalid |
13 | 14 | // if there are integer-overflow routines, they come after the others, in the order reduce, prefix, suffix (rps) |
14 | 15 | VARPSA rpsnull = {0, {0}}; |
@@ -131,9 +132,9 @@ static VARPSA rpsminus = {RATX+1 , { |
131 | 132 | {(VARPSF)minusinsO,VCVTIP+VD},{(VARPSF)minuspfxO,VCVTIP+VD},{(VARPSF)minussfxO,VCVTIP+VD}, // integer-overflow routines |
132 | 133 | }}; |
133 | 134 | static VARPSA rpsplus = {QPX+1 , { |
134 | | -{(VARPSF)plusinsB,VCVTIP+VI}, {0}, {(VARPSF)plusinsI,VCVTIP+VI}, {(VARPSF)plusinsD,VCVTIP+VD}, {(VARPSF)plusinsZ,VCVTIP+VZ}, {0}, {0}, {0}, {0}, {0}, {0}, {0}, {0}, {SY_64?(VARPSF)plusinsE:0,VCVTIP+VUNCH}, |
135 | | -{(VARPSF)pluspfxB,VCVTIP+VI}, {0}, {(VARPSF)pluspfxI,VCVTIP+VI}, {(VARPSF)pluspfxD,VCVTIP+VD+VIPOKW}, {(VARPSF)pluspfxZ,VCVTIP+VZ}, {0}, {(VARPSF)pluspfxX,VCVTIP+VX}, {(VARPSF)pluspfxQ,VCVTIP+VQ}, {0}, {0}, {0}, {0}, {0}, {0}, |
136 | | -{(VARPSF)plussfxB,VCVTIP+VI}, {0}, {(VARPSF)plussfxI,VCVTIP+VI}, {(VARPSF)plussfxD,VCVTIP+VD+VIPOKW}, {(VARPSF)plussfxZ,VCVTIP+VZ}, {0}, {(VARPSF)plussfxX,VCVTIP+VX}, {(VARPSF)plussfxQ,VCVTIP+VQ}, {0}, {0}, {0}, {0}, {0}, {0}, |
| 135 | +{(VARPSF)plusinsB,VCVTIP+VI}, {0}, {(VARPSF)plusinsI,VCVTIP+VI}, {(VARPSF)plusinsD,VCVTIP+VD}, {(VARPSF)plusinsZ,VCVTIP+VZ}, {0}, {0}, {0}, {0}, {(VARPSF)plusinsI2,VCVTIP+VI}, {(VARPSF)plusinsI4,VCVTIP+VI}, {0}, {0}, {SY_64?(VARPSF)plusinsE:0,VCVTIP+VUNCH}, |
| 136 | +{(VARPSF)pluspfxB,VCVTIP+VI}, {0}, {(VARPSF)pluspfxI,VCVTIP+VI}, {(VARPSF)pluspfxD,VCVTIP+VD+VIPOKW}, {(VARPSF)pluspfxZ,VCVTIP+VZ}, {0}, {(VARPSF)pluspfxX,VCVTIP+VX}, {(VARPSF)pluspfxQ,VCVTIP+VQ}, {0}, {(VARPSF)pluspfxI2,VCVTIP+VI}, {(VARPSF)pluspfxI4,VCVTIP+VI}, {0}, {0}, {0}, |
| 137 | +{(VARPSF)plussfxB,VCVTIP+VI}, {0}, {(VARPSF)plussfxI,VCVTIP+VI}, {(VARPSF)plussfxD,VCVTIP+VD+VIPOKW}, {(VARPSF)plussfxZ,VCVTIP+VZ}, {0}, {(VARPSF)plussfxX,VCVTIP+VX}, {(VARPSF)plussfxQ,VCVTIP+VQ}, {0}, {(VARPSF)plussfxI2,VCVTIP+VI}, {(VARPSF)plussfxI4,VCVTIP+VI}, {0}, {0}, {0}, |
137 | 138 | {(VARPSF)plusinsO,VCVTIP+VD},{(VARPSF)pluspfxO,VCVTIP+VD},{(VARPSF)plussfxO,VCVTIP+VD}, // integer-overflow routines |
138 | 139 | }}; |
139 | 140 | static VARPSA rpstymes = {RATX+1 , { |
@@ -976,7 +977,7 @@ I jtsumattymesprods(J jt,I it,void *avp, void *wvp,I dplen,I nfro,I nfri,I ndpo, |
976 | 977 | R 1; |
977 | 978 | } |
978 | 979 |
|
979 | | -#if C_AVX2 || EMU_AVX2 |
| 980 | +#if (C_AVX2 || EMU_AVX2) & HASFMA |
980 | 981 | // +/@:*"1 for QP, with IRS by hand |
981 | 982 | static DF2(jtsumattymes1E){ |
982 | 983 | if(unlikely((I)((1-AR(a))|(1-AR(w)))<0)){I lr=MIN((RANKT)jt->ranks,AR(a)); I rr=MIN(jt->ranks>>RANKTX,AR(w)); R rank2ex(a,w,(A)self,1,1,lr,rr,jtsumattymes1E);} // if multiple results needed, do rank loop |
@@ -1027,7 +1028,7 @@ DF2(jtsumattymes1){ |
1027 | 1028 | // if an argument is empty, sparse, or not a fast arithmetic type, or only one arg has rank 0, revert to the code for f/@:g atomic |
1028 | 1029 | if(((-((AT(a)|AT(w))&((NOUN|SPARSE)&~(B01|INT|FL))))|(AN(a)-1)|(AN(w)-1)|((acr-1)^(wcr-1)))<0) { // test for all unusual cases |
1029 | 1030 | ASSERT(fit!=2,EVNONCE) // user expected 2 atoms per result, but we don't support that for repeated atomic arg |
1030 | | -#if C_AVX2 || EMU_AVX2 // high-perf QP only on 64-bit |
| 1031 | +#if (C_AVX2 || EMU_AVX2) & HASFMA // high-perf QP only on 64-bit |
1031 | 1032 | if(ISDENSETYPE(AT(a)|AT(w),QP)&&((AN(a)-1)|(AN(w)-1)|(acr-1)|(wcr-1))>=0){ |
1032 | 1033 | // QP dot-product. Transfer to that code with rank still set |
1033 | 1034 | if(unlikely(!(AT(a)&QP)))RZ(a=cvt(QP,a)) else if(unlikely(!(AT(w)&QP)))RZ(w=cvt(QP,w)) // convert lower arg to qp |
|
0 commit comments