You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
if(unlikely(fit==2))AS(w)[AR(w)-1]=2; // if +/@:*"1!.1, we store two atoms per sum
1040
+
if(unlikely(fit==2))AS(z)[AR(z)-1]=2; // if +/@:*"1!.1, we store two atoms per sum
1041
1041
1042
1042
if(likely(fit==0)){RZ(jtsumattymesprods(jt,it,voidAV(a),voidAV(w),dplen,nfro,nfri,ndpo,ndpi,voidAV(z))); // eval standard dot-product, check for error
1043
1043
}else{
@@ -1046,7 +1046,7 @@ DF2(jtsumattymes1){
1046
1046
#if (C_AVX2&&SY_64) ||EMU_AVX2
1047
1047
#if1// higher precision. Required when a large product is added to a small total. Dependency loop for acc is 4 clocks; for c is 4 clocks. Total 12 insts, so unrolled 2 would do
__m256iendmask; /* length mask for the last word */
@@ -1084,13 +1084,22 @@ DF2(jtsumattymes1){
1084
1084
c0=_mm256_add_pd(c0,c1); c2=_mm256_add_pd(c2,c3); c0=_mm256_add_pd(c0,c2); // add all the low parts together - the low bits of the low will not make it through to the result
TWOSUM(acc0,acc1,acc0,c1); c0=_mm256_add_pd(c0,c1); // combine 0123, combine all low parts
1092
-
acc0=_mm256_add_pd(acc0,c0); // add low parts back into high in case there is overlap
1093
-
#else
1087
+
// acc0/c0 survive. Combine horizontally. Anything the high part touches must be extended precision; the low in one float. We guarantee extended precision from
1088
+
// the largest intermediate total encountered; sometimes we get a little more.
c0=_mm256_add_pd(c0,_mm256_permute_pd(c0,0xf)); acc1=_mm256_permute_pd(acc0,0xf); // c0[0] has total of all low parts, acc1=hi1+hi3
1092
+
TWOSUM(acc0,acc1,acc0,c1); c0=_mm256_add_pd(c0,c1); // acc0 has sum of all hi parts, c0 sum of all low parts+extensions
1093
+
if(fit==1){
1094
+
// normal result. Just add the extensions into the hi part
1095
+
acc0=_mm256_add_pd(acc0,c0); // add low parts back into high in case there is overlap
1096
+
}else{
1097
+
// extended result. We must preserve the extension bits in the total and write them out
1098
+
TWOSUM(acc0,c0,acc0,c1); // extended total
1099
+
zv[1]=_mm256_cvtsd_f64(c1); // store it out
1100
+
1101
+
}
1102
+
#else// obsolete
1094
1103
c0=_mm256_add_pd(c0,c1); c2=_mm256_add_pd(c2,c3); c0=_mm256_add_pd(c0,c2); // add all the low parts together - the low bits of the low will not make it through to the result
1095
1104
acc0=_mm256_add_pd(acc0,acc1); acc2=_mm256_add_pd(acc2,acc3); acc0=_mm256_add_pd(acc0,acc2); // add all the high parts
if(bv!=0&&prirow>=0)zv=0; // if we have DIP with a priority row, signal to process ALL rows in bk order. It's not really needed but it ensures that if the prirow is tied for pivot in the first
759
760
// column, we will take it
760
761
761
-
#defineCOLLPINIT I *bvgrd=bvgrd0; I i=-1; D *mv=mv0-n; D bkold=inf, cold=1.0; I bkle0=1;
762
-
#defineCOLLP do{if(unlikely(zv!=0)){++i; mv+=n;}else{i=*bvgrd; mv=mv0+n*i;} // for each row, i is the row#, mv points to the beginning of the row of M. If we take the whole col, take it in order for cache. Prefetch next row?
763
-
#defineCOLLPE }while(++bvgrd!=bvgrde);
764
762
do{
763
+
// start of processing one column
765
764
Icolx=*ndx; // get next column# to work on
766
765
Ilimitrow; // the best row to use as a pivot for this column; or # qualifying Dpiv found.
#defineCOLLPINIT I *bvgrd=bvgrd0; I i=-1; D *mv=mv0-n; D bkold=inf, cold=1.0; I bkle0=1;
778
+
#defineCOLLP do{if(unlikely(zv!=0)){++i; mv+=n;}else{i=*bvgrd; mv=mv0+n*i;} // for each row, i is the row#, mv points to the beginning of the row of M. If we take the whole col, take it in order for cache. Prefetch next row?
779
+
#defineCOLLPE }while(++bvgrd!=bvgrde);
775
780
COLLPINIT
776
781
// if the column is just to be fetched from M, do so without dot-product. We can use gather down the column, but there's no gain
777
782
__m256ddotprod; // place where product is assembled or read into
// 128!:9 matrix times sparse vector with optional early exit
954
960
// product mode:
955
-
// y is ndx;Ax;Am;Av;(M, shape m,n) where ndx is an atom
956
-
// if ndx<m, the column is ndx {"1 M; otherwise ((((ndx-m){Ax) ];.0 Am) {"1 M) +/@:*"1 ((ndx-m){Ax) ];.0 Av
957
-
// Result for product mode (exitvec is scalar) is the product
958
-
// DIP mode
961
+
// y is ndx;Ax;Am;Av;(M, shape m,n) where ndx is an atom
962
+
// if ndx<m, the column is ndx {"1 M; otherwise ((((ndx-m){Ax) ];.0 Am) {"1 M) +/@:*"1 ((ndx-m){Ax) ];.0 Av
963
+
// if M has rank 3 (with 2={.$M), do the product in extended precision
964
+
// Result for product mode (exitvec is scalar) is the product, one column of M
965
+
// DIP/Dpiv mode:
959
966
// y is ndx;Ax;Am;Av;(M, shape m,n);bkgrd;(ColThreshold/PivTol,MinPivot,bkmin,NFreeCols,NCols,ImpFac,Virtx/Dpivdir);bk/'';Frow[;exclusion list/Dpiv;Yk]
960
967
// Result is rc,best row,best col,#cols scanned,#dot-products evaluated,best gain (if rc e. 0 1 2)
961
968
// rc,failing column of NTT, an element of ndx (if rc=4)
// single index value. set bv=0, zv non0 as a flag that we are storing the column
1019
1026
bv=0; ASSERT(AN(w)==5,EVLENGTH); // if goodvec is an atom, set bv=0 to indicate that bv is not used and verify no more input
1020
-
if(unlikely(n==0)){Rreshape(sc(n),zeroionei(0));} // empty M, each product is 0
1021
-
GATV0(z,FL,n,1); zv=DAV(z); // allocate the result area for column extraction. Set zv nonzero so we use bkgrd of i. #M
1022
-
bvgrd0=0; bvgrde=bvgrd0+AS(C(AAV(w)[4]))[0]; // length of column is #M
1027
+
if(unlikely(n==0)){Rreshape(drop(num(-1),shape(C(AAV(w)[4]))),zeroionei(0));} // empty M, each product is 0
1028
+
Iepcol=AR(C(AAV(w)[4]))==3; // flag if we are doing an extended-precision column fetch
1029
+
GATV(z,FL,n<<epcol,1+epcol,AS(C(AAV(w)[4]))); zv=DAV(z); // allocate the result area for column extraction. Set zv nonzero so we use bkgrd of i. #M
1030
+
bvgrd0=0; bvgrde=bvgrd0+n; // length of column is #M
1023
1031
}else{
1024
1032
// A list of index values. We are doing the DIP calculation or Dpiv
1025
1033
ASSERT(AR(C(AAV(w)[5]))==1,EVRANK); ASSERT(AN(C(AAV(w)[5]))==0||AT(C(AAV(w)[5]))&INT,EVDOMAIN); bvgrd0=IAV(C(AAV(w)[5])); bvgrde=bvgrd0+AN(C(AAV(w)[5])); // bkgrd: the order of processing the rows, and end+1 ptr normally /: bk
D*qkv=DAV(qk); Iqksize=AS(qk)[0]; Iqksizesq=qksize*qksize; dpflag|=AR(qk)>2; // pointer to qk data, length of a row, offset to low part if present
1123
+
D*qkv=DAV(qk); Iqksize=AS(qk)[AR(qk)-1]; Iqksizesq=qksize*qksize; dpflag|=AR(qk)>2; // pointer to qk data, length of a row, offset to low part if present
1116
1124
UIrowx=0, rown=AN(prx); I*rowxv=IAV(prx); D*pcn0v=DAV(pivotcolnon0); dpflag|=(AR(pivotcolnon0)>1)<<1; // current row, # rows, address of row indexes, column data
1117
1125
UIcoln=AN(pcx); I*colxv=IAV(pcx); D*prn0v=DAV(newrownon0); dpflag|=(AR(newrownon0)>1)<<2; // # cols, address of col indexes. row data
prodh=_mm256_xor_pd(_mm256_fmsub_pd(relfuzzcct,prodh,qkvh),_mm256_fmsub_pd(relfuzzcct,qkvh,prodh)); // sets sign of prodh if fuzzy ne, means keep the result
1152
+
// create max(abs(qkvh),abs(pcoldh*prowdh)) which will go into threshold calc
// Do high-precision add of qkvh and iph. If this decreases the absvalue of qkvh, we will lose precision because of insufficient
1162
-
// bits of qkv. If this increases the absvalue of qkvh, all of qkvl will contribute and the limit of validity will be
1163
-
// from the product. In either case it is safe to accumulate all the partial products and ipl into qkvl
1164
-
qkvl=_mm256_sub_pd(qkvl,ipl); qkvl=_mm256_fnmadd_pd(prowdh,pcoldl,qkvl); qkvl=_mm256_fnmadd_pd(prowdl,pcoldh,qkvl); // the middle pps. low*low will never contribute unless qkv is exhausted & thus noise
1165
-
TWOSUM(qkvh,iph,qkvh,isl) // combine the high parts
1166
-
isl=_mm256_add_pd(isl,qkvl); // add the combined low parts
// Do high-precision add of qkvh and iph. If this decreases the absvalue of qkvh, we will lose precision because of insufficient
1177
+
// bits of qkv. If this increases the absvalue of qkvh, all of qkvl will contribute and the limit of validity will be
1178
+
// from the product. In either case it is safe to accumulate all the partial products and ipl into qkvl
1179
+
qkvl=_mm256_sub_pd(qkvl,ipl); qkvl=_mm256_fnmadd_pd(prowdh,pcoldl,qkvl); qkvl=_mm256_fnmadd_pd(prowdl,pcoldh,qkvl); // the middle pps. low*low will never contribute unless qkv is exhausted & thus noise
1180
+
TWOSUM(qkvh,iph,qkvh,isl) // combine the high parts
1181
+
isl=_mm256_add_pd(isl,qkvl); // add the combined low parts
1182
+
// Make sure qkvl is much less than qkvh
1183
+
TWOSUM(qkvh,isl,qkvh,qkvl) // put qkvh/qkvl into canonical form
1184
+
// convert maxabs to abs(qkvh) - maxabs*thresh: if < 0, means result should be forced to 0
ASSERT(AR(newrownon0)==1||AS(newrownon0)[0]==2, EVLENGTH) // newrownon0 is float or extended list
1210
1228
Atmp=AAV(a)[4]; if(!(AT(tmp)&FL))RZ(tmp=cvt(FL,tmp)); ASSERT(AR(tmp)==0,EVRANK) Drelfuzz=DAV(tmp)[0]; // relfuzz is a float atom
1211
1229
// agreement
1212
-
ASSERT(AN(prx)==AN(pivotcolnon0),EVLENGTH) ASSERT(AN(pcx)==AN(newrownon0),EVLENGTH) // indexes and values must agree
1230
+
ASSERT(AN(prx)==AS(pivotcolnon0)[AR(pivotcolnon0)-1],EVLENGTH) ASSERT(AN(pcx)==AS(newrownon0)[AR(newrownon0)-1],EVLENGTH) // indexes and values must agree
0 commit comments