You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
if(n2==1)goto oneloop; // if we don't have to loop here, avoid the data-dependent branch and fold the comparisons into the last batch
118
118
// obsolete if(~_mm256_movemask_epi8(allmatches))goto fail; // if searches are long, kick out when there is a miscompare
119
-
if(!_mm256_testc_pd(_mm256_castsi256_pd(allmatches),ones))goto fail; // if searches are long, kick out when there is a miscompare. test is '!(all bits of allmatches =1)'
119
+
if(!_mm256_testc_pd(_mm256_castsi256_pd(allmatches),ones))goto fail; // if searches are long, kick out when there is a miscompare. test is '!(all sign bits of allmatches =1)'
120
120
}while(--i>0);
121
121
}
122
122
oneloop:;
@@ -150,7 +150,7 @@ I memcmpne(void *s, void *t, I l){
150
150
151
151
UIn2=DUFFLPCT(n-1,3); /* # turns through duff loop */
152
152
if(n2>0){
153
-
__m256iallmatches=ones; // accumuland for compares init to all 1
153
+
__m256iallmatches=_mm256_castpd_si256(ones); // accumuland for compares init to all 1
154
154
UIbackoff=DUFFBACKOFF(n-1,3);
155
155
x+=(backoff+1)*NPAR; y+=(backoff+1)*NPAR;
156
156
switch(backoff){
@@ -165,7 +165,7 @@ I memcmpne(void *s, void *t, I l){
#defineONEPRODD D total0=0.0; D total1=0.0; if(dplen&1)total1=(D)*av++*(D)*wv++; DQ(dplen>>1, total0+=(D)*av++*(D)*wv++; total1+=(D)*av++*(D)*wv++;); *zv++=total0+total1;
938
938
#endif
@@ -1088,15 +1088,15 @@ DF2(jtsumattymes1){
1088
1088
// the largest intermediate total encountered; sometimes we get a little more.
if(unlikely(nc==0))R0; // abort with no action if no columns (possible only for DIP)
726
726
if(bv!=0&&prirow>=0)zv=0; // if we have DIP with a priority row, signal to process ALL rows in bk order. It's not really needed but it ensures that if the prirow is tied for pivot in the first
727
727
// column, we will take it
728
-
if(bv==0)thresh=_mm256_permute4x64_pd(thresh,0b00000000); // for Dpiv, repurpose thresh to all thresholds
728
+
if(bv==0)thresh=_mm256_permute4x64_pd(thresh,0b00000000); // for Dpiv or one-column, repurpose thresh to all thresholds
__m256dforce0=_mm256_cmp_pd(_mm256_andnot_pd(sgnbit,dotproducth),thresh,_CMP_LT_OQ); // 1s where we need to clamp
875
+
dotproducth=_mm256_blendv_pd(dotproducth,_mm256_setzero_pd(),force0); // set values < threshold to +0
874
876
if(likely(_mm256_testc_pd(endmask,sgnbit)))_mm256_storeu_pd(zv,dotproducth);else_mm256_maskstore_pd(zv,endmask,dotproducth); // store, masking if needed
875
877
if(limitrow==-3){
878
+
dotproductl=_mm256_blendv_pd(dotproductl,_mm256_setzero_pd(),force0); // set values < threshold to +0
876
879
if(likely(_mm256_testc_pd(endmask,sgnbit)))_mm256_storeu_pd(zv+n,dotproductl);else_mm256_maskstore_pd(zv+n,endmask,dotproductl); // repeat for low part
// 128!:9 matrix times sparse vector with optional early exit
1103
1106
// product mode:
1104
-
// y is ndx;Ax;Am;Av;(M, shape m,n) where ndx is an atom
1107
+
// y is ndx;Ax;Am;Av;(M, shape m,n);threshold where ndx is an atom
1105
1108
// if ndx<m, the column is ndx {"1 M; otherwise ((((ndx-m){Ax) ];.0 Am) {"1 M) +/@:*"1 ((ndx-m){Ax) ];.0 Av
1106
-
// if M has rank 3 (with 2={.$M), do the product in extended precision
1107
-
// Result for product mode (exitvec is scalar) is the product, one column of M
1109
+
// if M has rank 3 (with 2={.$M), do the product in quad precision
1110
+
// Result for product mode (exitvec is scalar) is the product, one column of M. Values closer to 0 than the threshold are clamped to 0
1108
1111
// DIP/Dpiv mode:
1112
+
// Only the high part of M is used if it is quad-precision
1109
1113
// y is ndx;Ax;Am;Av;(M, shape m,n);bkgrd;(ColThreshold/PivTol,MinPivot,bkmin,NFreeCols,NCols,ImpFac,Virtx/Dpivdir);bk/'';Frow[;exclusion list/Dpiv;Yk]
1110
1114
// Result is rc,best row,best col,#cols scanned,#dot-products evaluated,best gain (if rc e. 0 1 2)
1111
1115
// rc,failing column of NTT, an element of ndx (if rc=4)
__m256dthresh; // ColThr Inf bkmin MinPivot validity thresholds, small positive values - for one-column mode, all lanes have the threshold for zero-clamp
1158
1162
Ibestcol=1LL<<(BW-1), bestcolrow=0; // col# and row#+mask for best value found from previous column, init to no col found, and best value 'dangerous or not found'
1159
1163
// obsolete 1LL<<(32+3);
1160
1164
Az; D*zv; D*Frow; // pointer to output for product mode, Frow
// single index value. set bv=0, zv non0 as a flag that we are storing the column
1169
-
bv=0; ASSERT(AN(w)==5,EVLENGTH); // if goodvec is an atom, set bv=0 to indicate that bv is not used and verify no more input
1173
+
bv=0; ASSERT(AN(w)==6,EVLENGTH); // if goodvec is an atom, set bv=0 to indicate that bv is not used and verify no more input
1170
1174
if(unlikely(n==0)){Rreshape(drop(num(-1),shape(C(AAV(w)[4]))),zeroionei(0));} // empty M, each product is 0
1175
+
ASSERT(AR(C(AAV(w)[5]))==0,EVRANK); ASSERT(AT(C(AAV(w)[5]))&FL,EVDOMAIN); // thresh must be a float atom
1171
1176
Iepcol=AR(C(AAV(w)[4]))==3; // flag if we are doing an extended-precision column fetch
1172
1177
GATV(z,FL,n<<epcol,1+epcol,AS(C(AAV(w)[4]))); zv=DAV(z); // allocate the result area for column extraction. Set zv nonzero so we use bkgrd of i. #M
1173
1178
bvgrd0=0; bvgrde=bvgrd0+n; // length of column is #M
1179
+
thresh=_mm256_set1_pd(DAV(C(AAV(w)[5]))[0]); // load threshold in all lanes
1174
1180
}else{
1175
1181
// A list of index values. We are doing the DIP calculation or Dpiv
1176
1182
ASSERT(AR(C(AAV(w)[5]))==1,EVRANK); ASSERT(AN(C(AAV(w)[5]))==0||AT(C(AAV(w)[5]))&INT,EVDOMAIN); bvgrd0=IAV(C(AAV(w)[5])); bvgrde=bvgrd0+AN(C(AAV(w)[5])); // bkgrd: the order of processing the rows, and end+1 ptr normally /: bk
if(AN(C(AAV(w)[8]))==0)Frow=0;else{ASSERT(AT(C(AAV(w)[8]))&FL,EVDOMAIN); ASSERT(AN(C(AAV(w)[8]))==AS(C(AAV(w)[4]))[0]+AS(C(AAV(w)[1]))[0],EVLENGTH); Frow=DAV(C(AAV(w)[8]));} // if Frow omitted we are looking to make bks nonzero
D*qkv=DAV(qk); Iqksize=AS(qk)[AR(qk)-1]; Iqksizesq=qksize*qksize; dpflag|=AR(qk)>2; // pointer to qk data, length of a row, offset to low part if present
1272
+
D*qkv=DAV(qk); Iqksize=AS(qk)[AR(qk)-1]; It=AR(prx)+1; t=(t!=1)?qksize:t; Iqksizesq=qksize*t; dpflag|=AR(qk)>AR(prx)+1; // pointer to qk data, length of a row, offset to low part if present. offset is qksize^2, or bksize
1267
1273
UIrowx=0, rown=AN(prx); I*rowxv=IAV(prx); D*pcn0v=DAV(pivotcolnon0); dpflag|=(AR(pivotcolnon0)>1)<<1; // current row, # rows, address of row indexes, column data
1268
1274
UIcoln=AN(pcx); I*colxv=IAV(pcx); D*prn0v=DAV(newrownon0); dpflag|=(AR(newrownon0)>1)<<2; // # cols, address of col indexes. row data
ASSERT(AR(newrownon0)==1||AS(newrownon0)[0]==2,EVLENGTH) // newrownon0 is float or extended list
1377
+
ASSERT(AR(newrownon0)==1||AS(newrownon0)[0]==2,EVLENGTH) // newrownon0 is float or extended list
1373
1378
Atmp=AAV(a)[4]; if(!(AT(tmp)&FL))RZ(tmp=cvt(FL,tmp)); ASSERT(AR(tmp)==0,EVRANK) Drelfuzz=DAV(tmp)[0]; // relfuzz is a float atom
1374
1379
// agreement
1380
+
ASSERT(BETWEENC(AR(w),AR(prx)+1,AR(prx)+2),EVRANK) // Qk is nxn; bk is n, treated as a single row. Each may be quadprec
1381
+
ASSERT(AR(w)==AR(prx)+1||AS(w)[0]==2,EVLENGTH)
1382
+
if(AR(prx)!=0){ASSERT(AS(w)[AR(w)-1]==AS(w)[AR(w)-2],EVLENGTH) DO(AN(prx), ASSERT(IAV(prx)[i]<AS(w)[AR(w)-2],EVINDEX))} else{ASSERT(IAV(prx)[0]==0,EVINDEX)} // Qk must be square; bk not; valid row indexes
1375
1383
ASSERT(AN(prx)==AS(pivotcolnon0)[AR(pivotcolnon0)-1],EVLENGTH) ASSERT(AN(pcx)==AS(newrownon0)[AR(newrownon0)-1],EVLENGTH) // indexes and values must agree
0 commit comments