Skip to content

Commit 3f83076

Browse files
committed
WIP mm256_testc
1 parent 1619d02 commit 3f83076

3 files changed

Lines changed: 121 additions & 18 deletions

File tree

jsrc/j.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1902,14 +1902,18 @@ if(likely(type _i<3)){z=(I)&oneone; z=type _i>1?(I)_zzt:z; _zzt=type _i<1?(I*)z:
19021902
#define LGSZS 1 // lg (bytes in an S)
19031903

19041904
#if (C_AVX2&&SY_64) || EMU_AVX2
1905-
// create double-precision product of inputs
1905+
// create double-precision product of inputs. outhi must not be an input; outlo can
19061906
#define TWOPROD(in0,in1,outhi,outlo) outhi=_mm256_mul_pd(in0,in1); outlo=_mm256_fmsub_pd(in0,in1,outhi);
1907-
// create double-precision sum of inputs, where it is not known which is larger NOTE in0 and outhi might be identical. Needs sgnbit.
1907+
// create double-precision sum of inputs, where it is not known which is larger NOTE in0 and outhi might be identical. outlo must not be an input. Needs sgnbit.
19081908
#define TWOSUM(in0,in1,outhi,outlo) {__m256d t=_mm256_andnot_pd(sgnbit,in0); outlo=_mm256_andnot_pd(sgnbit,in1); t=_mm256_sub_pd(t,outlo); \
19091909
outlo=_mm256_blendv_pd(in0,in1,t); t=_mm256_blendv_pd(in1,in0,t); /* outlo=val with larger abs t=val with smaller abs */ \
19101910
outhi=_mm256_add_pd(in0,in1); /* single-prec sum */ \
19111911
outlo=_mm256_sub_pd(outlo,outhi); /* big-(big+small): implied val of -small after rounding */ \
19121912
outlo=_mm256_add_pd(outlo,t);} // amt by which actual value exceeds implied: this is the lost low precision
1913+
// Same, but we know which argument is bigger. outhi cannot be an input; outlo can be the same as inbig
1914+
#define TWOSUMBS(inbig,insmall,outhi,outlo) {outhi=_mm256_add_pd(inbig,insmall); /* single-prec sum */ \
1915+
outlo=_mm256_sub_pd(inbig,outhi); /* big-(big+small): implied val of -small after rounding */ \
1916+
outlo=_mm256_add_pd(outlo,insmall);} // amt by which actual value exceeds implied: this is the lost low precision
19131917
#define DPADD(hi0,lo0,hi1,lo1,outhi,outlo) outhi=_mm256_add_pd(hi0,hi1); outlo=_mm256_add_pd(lo0,lo1);
19141918
#else
19151919
#define TWOSPLIT(a,x,y) y=(a)*134217730.0; x=y-(a); x=y-x; y=(a)-x; // must avoid compiler tuning

jsrc/v1.c

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ static B eqv(I af,I wf,I m,I n,I k,C* RESTRICT av,C* RESTRICT wv,B* RESTRICT z,B
9898
// fetch the load mask for the last block: the words to load, including any trailing fragment
9999
// step up to qword boundary
100100
I *x=(I*)((C*)av+((k-1)&(SZI-1))+1), *y=(I*)((C*)wv+((k-1)&(SZI-1))+1); // access the arguments as Is
101-
__m256i allmatches =_mm256_cmpeq_epi8(endmask,endmask); // accumuland for compares init to all 1
101+
__m256i allmatches =_mm256_cmpeq_epi8(endmask,endmask),ones=allmatches; // accumuland for compares init to all 1
102102
b=b1; // init store value to compare failure
103103
if(n2>0){
104104
UI i = n2; // inner loop size
@@ -115,13 +115,15 @@ static B eqv(I af,I wf,I m,I n,I k,C* RESTRICT av,C* RESTRICT wv,B* RESTRICT z,B
115115
case -8: u=_mm256_loadu_si256 ((__m256i*)(x+7*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+7*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v));
116116
x+=8*NPAR; y+=8*NPAR;
117117
if(n2==1)goto oneloop; // if we don't have to loop here, avoid the data-dependent branch and fold the comparisons into the last batch
118-
if(~_mm256_movemask_epi8(allmatches))goto fail; // if searches are long, kick out when there is a miscompare
118+
// obsolete if(~_mm256_movemask_epi8(allmatches))goto fail; // if searches are long, kick out when there is a miscompare
119+
if(!_mm256_testc_si256(allmatches,ones))goto fail; // if searches are long, kick out when there is a miscompare. test is '!(all bits of allmatches =1)'
119120
}while(--i>0);
120121
}
121122
oneloop:;
122123
}
123124
u=_mm256_maskload_epi64(x,endmask); v=_mm256_maskload_epi64(y,endmask);
124-
b ^= 0==~_mm256_movemask_epi8(_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v))); // no miscompares, switch failure value to success
125+
b ^= _mm256_testc_si256(_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v)),ones); // no miscompares, switch failure value to success. test 1=good
126+
// obsolete b ^= 0==~_mm256_movemask_epi8(_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v))); // no miscompares, switch failure value to success
125127
fail:
126128
*z++=b; // store one result
127129
wv += k; av+= ka; // advance w always, and a if original m was 1
@@ -144,10 +146,11 @@ I memcmpne(void *s, void *t, I l){
144146
I n=(l-1)>>LGSZI; // number of Ds to process - cannot be 0
145147
__m256i u,v;
146148
__m256i endmask = _mm256_loadu_si256((__m256i*)(validitymask+((-n)&(NPAR-1)))); // mask for 0 1 2 3 4 5 is xxxx 0001 0011 0111 1111 0001
149+
__m256i ones=_mm256_cmpeq_epi8(endmask,endmask);
147150

148151
UI n2=DUFFLPCT(n-1,3); /* # turns through duff loop */
149152
if(n2>0){
150-
__m256i allmatches =_mm256_cmpeq_epi8(endmask,endmask); // accumuland for compares init to all 1
153+
__m256i allmatches =ones; // accumuland for compares init to all 1
151154
UI backoff=DUFFBACKOFF(n-1,3);
152155
x+=(backoff+1)*NPAR; y+=(backoff+1)*NPAR;
153156
switch(backoff){
@@ -161,13 +164,15 @@ I memcmpne(void *s, void *t, I l){
161164
case -7: u=_mm256_loadu_si256 ((__m256i*)(x+6*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+6*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v));
162165
case -8: u=_mm256_loadu_si256 ((__m256i*)(x+7*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+7*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v));
163166
x+=8*NPAR; y+=8*NPAR;
164-
if(~_mm256_movemask_epi8(allmatches))R 1;
167+
// obsolete if(~_mm256_movemask_epi8(allmatches))R 1;
168+
if(!_mm256_testc_si256(allmatches,ones))R 1; // test is '!(all bits of allmatches=1)'
165169
}while(--n2>0);
166170
}
167171
}
168172

169173
u=_mm256_maskload_epi64(x,endmask); v=_mm256_maskload_epi64(y,endmask);
170-
R 0!=~_mm256_movemask_epi8(_mm256_cmpeq_epi8(u,v)); // no miscompares, compare equal
174+
R !_mm256_testc_si256(_mm256_cmpeq_epi8(u,v),ones); // return 1 if any mismatch
175+
// obsolete R 0!=~_mm256_movemask_epi8(_mm256_cmpeq_epi8(u,v)); // no miscompares, compare equal
171176
}
172177

173178
// memcmpnefl: test for inequality, not caring about order, for float inputs, possibly with tolerance
@@ -180,6 +185,7 @@ I memcmpnefl(void *s, void *t, I l, J jt){
180185
D *x=s, *y=t; // access the arguments as doubles
181186
__m256d u,v;
182187
__m256i endmask = _mm256_loadu_si256((__m256i*)(validitymask+((-l)&(NPAR-1)))); // mask for 0 1 2 3 4 5 is xxxx 0001 0011 0111 1111 0001
188+
__m256d ones=_mm256_castsi256_pd(_mm256_cmpeq_epi8(endmask,endmask));
183189
if(jt->cct==1.0){
184190
// intolerant comparison
185191
UI n2=DUFFLPCT(l-1,3); /* # turns through duff loop */
@@ -198,24 +204,28 @@ I memcmpnefl(void *s, void *t, I l, J jt){
198204
case -7: u=_mm256_loadu_pd(x+6*NPAR); v=_mm256_loadu_pd(y+6*NPAR); allmatches=_mm256_and_pd(allmatches,_mm256_cmp_pd(u,v,_CMP_EQ_OQ));
199205
case -8: u=_mm256_loadu_pd(x+7*NPAR); v=_mm256_loadu_pd(y+7*NPAR); allmatches=_mm256_and_pd(allmatches,_mm256_cmp_pd(u,v,_CMP_EQ_OQ));
200206
x+=8*NPAR; y+=8*NPAR;
201-
if(0xf!=_mm256_movemask_pd(allmatches))R 1;
207+
// obsolete if(0xf!=_mm256_movemask_pd(allmatches))R 1;
208+
if(!_mm256_testc_pd(allmatches,ones))R 1; // test is '!(all bits of allmatches=1)'
202209
}while(--n2>0);
203210
}
204211
}
205212
u=_mm256_maskload_pd(x,endmask); v=_mm256_maskload_pd(y,endmask);
206-
R 0xf!=_mm256_movemask_pd(_mm256_cmp_pd(u,v,_CMP_EQ_OQ)); // no miscompares, compare equal
213+
// obsolete R 0xf!=_mm256_movemask_pd(_mm256_cmp_pd(u,v,_CMP_EQ_OQ)); // no miscompares, compare equal
214+
R !_mm256_testc_pd(_mm256_cmp_pd(u,v,_CMP_EQ_OQ),ones); // return 1 if any mismatch
207215
}
208216
UI i=(l-1)>>LGNPAR; /* # loops for 0 1 2 3 4 5 is x 0 0 0 0 1 */
209217
// tolerant comparison
210218
__m256d cct=_mm256_broadcast_sd(&jt->cct);
211219
if(i){
212220
do{
213221
u=_mm256_loadu_pd(x); v=_mm256_loadu_pd(y); x+=NPAR; y+=NPAR;
214-
if(0xf!=_mm256_movemask_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ))))R 1;
222+
// obsolete if(0xf!=_mm256_movemask_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ))))R 1;
223+
if(!_mm256_testc_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ)),ones))R 1;
215224
}while(--i>0);
216225
}
217226
u=_mm256_maskload_pd(x,endmask); v=_mm256_maskload_pd(y,endmask);
218-
R 0xf!=_mm256_movemask_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ)));
227+
// obsolete R 0xf!=_mm256_movemask_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ)));
228+
R !_mm256_testc_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ)),ones);
219229
}
220230

221231
static B eqvfl(I af,I wf,I m,I n,I k,D* RESTRICT av,D* RESTRICT wv,B* RESTRICT z,B b1,J jt){
@@ -228,6 +238,7 @@ static B eqvfl(I af,I wf,I m,I n,I k,D* RESTRICT av,D* RESTRICT wv,B* RESTRICT z
228238
__m256d u,v;
229239
// prep for each compare loop
230240
__m256i endmask = _mm256_loadu_si256((__m256i*)(validitymask+((-k)&(NPAR-1)))); // mask for 0 1 2 3 4 5 is xxxx 0001 0011 0111 1111 0001
241+
__m256d ones=_mm256_castsi256_pd(_mm256_cmpeq_epi8(endmask,endmask));
231242
UI n2=DUFFLPCT(k-1,3); /* # turns through duff loop */
232243
UI backoff=DUFFBACKOFF(k-1,3);
233244
UI i0=(k-1)>>LGNPAR; /* # loops for 0 1 2 3 4 5 is x 0 0 0 0 1 used for tolerant */
@@ -260,25 +271,29 @@ static B eqvfl(I af,I wf,I m,I n,I k,D* RESTRICT av,D* RESTRICT wv,B* RESTRICT z
260271
case -8: u=_mm256_loadu_pd(x+7*NPAR); v=_mm256_loadu_pd(y+7*NPAR); allmatches=_mm256_and_pd(allmatches,_mm256_cmp_pd(u,v,_CMP_EQ_OQ));
261272
x+=8*NPAR; y+=8*NPAR;
262273
if(n2==1)goto oneloop; // if we don't have to loop here, avoid the data-dependent branch and fold the comparisons into the last batch
263-
if(0xf!=_mm256_movemask_pd(allmatches))goto fail;
274+
// obsolete if(0xf!=_mm256_movemask_pd(allmatches))goto fail;
275+
if(!_mm256_testc_pd(allmatches,ones))goto fail; // test is '!(all bits of allmatches=1)'
264276
}while(--i>0);
265277
}
266278
}
267279
oneloop:
268280
u=_mm256_maskload_pd(x,endmask); v=_mm256_maskload_pd(y,endmask);
269-
b ^= 0xf==_mm256_movemask_pd(_mm256_and_pd(allmatches,_mm256_cmp_pd(u,v,_CMP_EQ_OQ))); // no miscompares, compare equal
281+
// obsolete b ^= 0xf==_mm256_movemask_pd(_mm256_and_pd(allmatches,_mm256_cmp_pd(u,v,_CMP_EQ_OQ))); // no miscompares, compare equal
282+
b ^= _mm256_testc_pd(_mm256_and_pd(allmatches,_mm256_cmp_pd(u,v,_CMP_EQ_OQ)),ones); // no miscompares, switch failure value to success. test 1=good
270283
}else{
271284
// tolerant comparison
272285
__m256d cct=_mm256_broadcast_sd(&jt->cct);
273286
if(i0){
274287
UI i = i0; // inner loop size
275288
do{ // unfortunately it's probably not worth checking for lengths 5-8 & we will have a misbranch whenever length > 4
276289
u=_mm256_loadu_pd(x); v=_mm256_loadu_pd(y); x+=NPAR; y+=NPAR;
277-
if(0xf!=_mm256_movemask_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ))))goto fail;
290+
// obsolete if(0xf!=_mm256_movemask_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ))))goto fail;
291+
if(!_mm256_testc_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ)),ones))goto fail;
278292
}while(--i>0);
279293
}
280294
u=_mm256_maskload_pd(x,endmask); v=_mm256_maskload_pd(y,endmask);
281-
b ^= 0xf==_mm256_movemask_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ)));
295+
// obsolete b ^= 0xf==_mm256_movemask_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ)));
296+
b ^= _mm256_testc_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ)),ones);
282297
}
283298

284299
fail:

jsrc/vfrom.c

Lines changed: 86 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -772,7 +772,91 @@ static unsigned char jtmvmsparsex(J jt,void* const ctx,UI4 ti){
772772
limitrow=-1; // init no eligible row found
773773
}else limitrow=0; // for Dpiv, init to none found
774774
#if 0
775-
// process the column NPAR values at a time
775+
I epcol=AR(qk)==3; // flag if we are doing an extended-precision column fetch scaf move out of loop, and into limitrow
776+
__m256d sgnbit=_mm256_broadcast_sd((D*)&Iimin); // scaf move out of loop
777+
__m256i rowstride=_mm256_set1_epi64x(n); // number of Ds in a row of M, in each lane
778+
// init for the column
779+
__m256i endmask=_mm256_cmpeq_epi64(sgnbit,sgnbit); /* length mask for the last word */
780+
__m256i indexes, rownums; // row numbers and atom indexes for the rows we are fetching
781+
I *bvgrd=bvgrd0; I i=-1; D bkold=inf, cold=1.0; I bkle0=1;
782+
if(bv==0&&zv!=0){ // one-column mode: gather offsets address successive rows
783+
..
784+
}
785+
// create the column NPAR values at a time
786+
do{
787+
__m256i indexes; // indexes for the rows we are fetching
788+
__m256d dotproducth,dotproductl; // where we build the column value
789+
// get the validity mask for the gather and process: leave as ones until the end of the column
790+
if(unlikely(bvgrde-bvgrd<NPAR))endmask = _mm256_loadu_si256((__m256i*)(validitymask+NPAR-(bvgrde-bvgrd))); /* mask for 00=1111, 01=1000, 10=1100, 11=1110 */
791+
// if DIP/Dpiv, fetch the row#s of the next group to process, in bkg order
792+
if(bv!=0||zv==0){
793+
..
794+
}else{
795+
rownums=_mm256_add_epi64(rownums,_mm256_sll_epi64(rowstride,LGNPAR)); // advance NPAR rows to next sequential values
796+
}
797+
indexes=_mm256_mul_epu32(rowstride,rownums);
798+
// Now mv and indexes are set up to read the correct rows
799+
// get the next NPAR values by dot-product with Av
800+
if(colx<n){
801+
// fetching from the Ek matrix itself. Just fetch the values from the column
802+
dotproducth=_mm256_setzero_pd(); dotproducth=_mm256_mask_i64gather_pd(qkvh,mv0+colx,indexes,endmask,SZI);
803+
if(epcol){dotproductl=_mm256_setzero_pd(); dotproductl=_mm256_mask_i64gather_pd(qkvh,mv0+n*n+colx,indexes,endmask,SZI);}
804+
}else{
805+
// fetching from A. Form Ek row . A column for each of the 4 rows
806+
I an=axv[colx][1]; // number of sparse atoms in each row
807+
D *vv=avv0+axv[colx][0]; // pointer to values for this section of A
808+
I *iv=amv0+axv[colx][0]; // pointer to row numbers of the values in *vv (these are the columns we fetch in turn from Ek)
809+
if(likely(!epcol)){
810+
// single-precision accumulate
811+
dotproducth=_mm256_setzero_pd();
812+
I k;
813+
NOUNROLL for(k=0;k<an;++k){
814+
dotproductl=_mm256_setzero_pd(); dotproductl=_mm256_mask_i64gather_pd(qkvh,mv0+iv[k],indexes,endmask,SZI); // fetch from up to 4 rows
815+
dotproducth=_mm256_fmadd_pd(dotproductl,_mm256_set1_pd(vv[k]),dotproducth); // accumulate the dot-product
816+
}
817+
}else{
818+
// extended-precision accumulate
819+
__m256d th,tl,tl2,vval; // temps for value loaded, and multiplier from A column
820+
if(likely(an!=0)){
821+
// get column number to fetch; fetch 4 rows
822+
th=_mm256_setzero_pd(); th=_mm256_mask_i64gather_pd(qkvh,mv0+iv[0],indexes,endmask,SZI); // fetch from up to 4 rows
823+
tl=_mm256_setzero_pd(); tl=_mm256_mask_i64gather_pd(qkvh,mv0+n*n+iv[0],indexes,endmask,SZI); // fetch from up to 4 rows
824+
vval=_mm256_set1_pd(vv[0]); // load column value
825+
// initialize the dotproduct with the first product
826+
TWOPROD(th,vval,dotproducth,tl2) // high qk * col
827+
tl2=_mm256_fmadd_pd(tl,vval,tl2); // low qk*col, and add in extension of prev product
828+
TWOADD(dotproducth,tl2,dotproducth,dotproductl) // combine high & low
829+
I k;
830+
for(k=1;k<an;++k){ // for each other element of the dot-product
831+
// get column number to fetch; fetch 4 rows
832+
th=_mm256_setzero_pd(); th=_mm256_mask_i64gather_pd(qkvh,mv0+iv[k],indexes,endmask,SZI); // fetch from up to 4 rows
833+
tl=_mm256_setzero_pd(); tl=_mm256_mask_i64gather_pd(qkvh,mv0+n*n+iv[k],indexes,endmask,SZI); // fetch from up to 4 rows
834+
vval=_mm256_set1_pd(vv[k]); // load column value
835+
// accumulate the dot-product
836+
TWOPROD(th,vval,th,tl2) // high qk * col
837+
tl2=_mm256_fmadd_pd(tl,vval,tl2); // low qk*col, and add in extension of prev product
838+
TWOADD(dotproducth,th,dotproducth,vval) tl2=_mm256_add_pd(vval,tl2); // add high parts & accum extension
839+
TWOADD(dotproducth,tl2,dotproducth,dotproductl) // combine high & extension for final form
840+
}
841+
}else{dotproducth=dotproductl=_mm256_setzero_pd();} // no columns should not occur. Just 1 shouldn't either
842+
}
843+
}
844+
// process the NPAR generated values
845+
if(bv==0&&zv!=0){
846+
// one-column mode: just store out the values
847+
..
848+
}else{
849+
// DIP/Dpiv mode: process each value in turn. Since 0 values are never pivots, we can stop when all remaining values are 0
850+
indexes=rownums; // repurpose indexes to hold the row-number we are working on
851+
while(_mm256_movemask_pd(_mm256_cmp_pd(dotproducth,_mm256_setzero_pd(),_CMP_EQ_OQ))!=0xf){
852+
dotprod=_mm256_permute4x64_pd(dotproducth,0b00000000); // copy next value into all lanes
853+
dotproducth=_mm256_permute4x64_pd(dotproducth,0b11111001); dotproducth=_mm256_blend_pd(dotproducth,_mm256_setzero_pd(),0b1000); // shift down one value for next time
854+
i=_mm256_extract_epi64(indexes,0); indexes=_mm256_permute4x64_epi64(indexes,0b11111001); // get the row number we are trying to swap out; shift row number down for next loop
855+
PROCESSROWRATIOS
856+
}
857+
}
858+
}
859+
}while((bvgrd+=NPAR)<bvgrde);
776860
#else
777861
#define COLLPINIT I *bvgrd=bvgrd0; I i=-1; D *mv=mv0-n; D bkold=inf, cold=1.0; I bkle0=1;
778862
#define COLLP do{if(unlikely(zv!=0)){++i; mv+=n;}else{i=*bvgrd; mv=mv0+n*i;} // for each row, i is the row#, mv points to the beginning of the row of M. If we take the whole col, take it in order for cache. Prefetch next row?
@@ -970,7 +1054,7 @@ static unsigned char jtmvmsparsex(J jt,void* const ctx,UI4 ti){
9701054
// rc=5 (not created - means problem is infeasible) rc=6=empty M, problem is malformed
9711055
// if the exclusion list is given, we stop on the first nonimproving pivot, and the exclusion list is used to prevent repetition of basis
9721056
// If Frow is empty, we are looking for nonimproving pivots in rows where the selector is 0. In that case the bkgrd puts the bk values in descending order. We return the first column that will make more 0 B rows non0 than non0 B rows 0.
973-
// If bk is empty, we are counting the #places where c>=PivTol and accumulating into Dpiv under control of Dpivdir (-1=decr, 1=incr; init to 0 if neg)
1057+
// If bk is empty, we are looking in bkgrd columns and counting the #places where c>=PivTol and accumulating into Dpiv under control of Dpivdir (-1=decr, 1=incr; init to 0 if neg)
9741058
// Rank is infinite
9751059
F1(jtmvmsparse){PROLOG(832);
9761060
#if C_AVX2

0 commit comments

Comments
 (0)