Skip to content

Commit 3f83076

Browse files
committed
WIP mm256_testc
1 parent 1619d02 commit 3f83076

3 files changed

Lines changed: 121 additions & 18 deletions

File tree

jsrc/j.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1902,14 +1902,18 @@ if(likely(type _i<3)){z=(I)&oneone; z=type _i>1?(I)_zzt:z; _zzt=type _i<1?(I*)z:
19021902
#define LGSZS 1 // lg (bytes in an S)
19031903

19041904
#if (C_AVX2&&SY_64) || EMU_AVX2
1905-
// create double-precision product of inputs
1905+
// create double-precision product of inputs. outhi must not be an input; outlo can
19061906
#define TWOPROD(in0,in1,outhi,outlo) outhi=_mm256_mul_pd(in0,in1); outlo=_mm256_fmsub_pd(in0,in1,outhi);
1907-
// create double-precision sum of inputs, where it is not known which is larger NOTE in0 and outhi might be identical. Needs sgnbit.
1907+
// create double-precision sum of inputs, where it is not known which is larger NOTE in0 and outhi might be identical. outlo must not be an input. Needs sgnbit.
19081908
#define TWOSUM(in0,in1,outhi,outlo) {__m256d t=_mm256_andnot_pd(sgnbit,in0); outlo=_mm256_andnot_pd(sgnbit,in1); t=_mm256_sub_pd(t,outlo); \
19091909
outlo=_mm256_blendv_pd(in0,in1,t); t=_mm256_blendv_pd(in1,in0,t); /* outlo=val with larger abs t=val with smaller abs */ \
19101910
outhi=_mm256_add_pd(in0,in1); /* single-prec sum */ \
19111911
outlo=_mm256_sub_pd(outlo,outhi); /* big-(big+small): implied val of -small after rounding */ \
19121912
outlo=_mm256_add_pd(outlo,t);} // amt by which actual value exceeds implied: this is the lost low precision
1913+
// Same, but we know which argument is bigger. outhi cannot be an input; outlo can be the same as inbig
1914+
#define TWOSUMBS(inbig,insmall,outhi,outlo) {outhi=_mm256_add_pd(inbig,insmall); /* single-prec sum */ \
1915+
outlo=_mm256_sub_pd(inbig,outhi); /* big-(big+small): implied val of -small after rounding */ \
1916+
outlo=_mm256_add_pd(outlo,insmall);} // amt by which actual value exceeds implied: this is the lost low precision
19131917
#define DPADD(hi0,lo0,hi1,lo1,outhi,outlo) outhi=_mm256_add_pd(hi0,hi1); outlo=_mm256_add_pd(lo0,lo1);
19141918
#else
19151919
#define TWOSPLIT(a,x,y) y=(a)*134217730.0; x=y-(a); x=y-x; y=(a)-x; // must avoid compiler tuning

jsrc/v1.c

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ static B eqv(I af,I wf,I m,I n,I k,C* RESTRICT av,C* RESTRICT wv,B* RESTRICT z,B
9898
// fetch the load mask for the last block: the words to load, including any trailing fragment
9999
// step up to qword boundary
100100
I *x=(I*)((C*)av+((k-1)&(SZI-1))+1), *y=(I*)((C*)wv+((k-1)&(SZI-1))+1); // access the arguments as Is
101-
__m256i allmatches =_mm256_cmpeq_epi8(endmask,endmask); // accumuland for compares init to all 1
101+
__m256i allmatches =_mm256_cmpeq_epi8(endmask,endmask),ones=allmatches; // accumuland for compares init to all 1
102102
b=b1; // init store value to compare failure
103103
if(n2>0){
104104
UI i = n2; // inner loop size
@@ -115,13 +115,15 @@ static B eqv(I af,I wf,I m,I n,I k,C* RESTRICT av,C* RESTRICT wv,B* RESTRICT z,B
115115
case -8: u=_mm256_loadu_si256 ((__m256i*)(x+7*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+7*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v));
116116
x+=8*NPAR; y+=8*NPAR;
117117
if(n2==1)goto oneloop; // if we don't have to loop here, avoid the data-dependent branch and fold the comparisons into the last batch
118-
if(~_mm256_movemask_epi8(allmatches))goto fail; // if searches are long, kick out when there is a miscompare
118+
// obsolete if(~_mm256_movemask_epi8(allmatches))goto fail; // if searches are long, kick out when there is a miscompare
119+
if(!_mm256_testc_si256(allmatches,ones))goto fail; // if searches are long, kick out when there is a miscompare. test is '!(all bits of allmatches =1)'
119120
}while(--i>0);
120121
}
121122
oneloop:;
122123
}
123124
u=_mm256_maskload_epi64(x,endmask); v=_mm256_maskload_epi64(y,endmask);
124-
b ^= 0==~_mm256_movemask_epi8(_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v))); // no miscompares, switch failure value to success
125+
b ^= _mm256_testc_si256(_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v)),ones); // no miscompares, switch failure value to success. test 1=good
126+
// obsolete b ^= 0==~_mm256_movemask_epi8(_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v))); // no miscompares, switch failure value to success
125127
fail:
126128
*z++=b; // store one result
127129
wv += k; av+= ka; // advance w always, and a if original m was 1
@@ -144,10 +146,11 @@ I memcmpne(void *s, void *t, I l){
144146
I n=(l-1)>>LGSZI; // number of Ds to process - cannot be 0
145147
__m256i u,v;
146148
__m256i endmask = _mm256_loadu_si256((__m256i*)(validitymask+((-n)&(NPAR-1)))); // mask for 0 1 2 3 4 5 is xxxx 0001 0011 0111 1111 0001
149+
__m256i ones=_mm256_cmpeq_epi8(endmask,endmask);
147150

148151
UI n2=DUFFLPCT(n-1,3); /* # turns through duff loop */
149152
if(n2>0){
150-
__m256i allmatches =_mm256_cmpeq_epi8(endmask,endmask); // accumuland for compares init to all 1
153+
__m256i allmatches =ones; // accumuland for compares init to all 1
151154
UI backoff=DUFFBACKOFF(n-1,3);
152155
x+=(backoff+1)*NPAR; y+=(backoff+1)*NPAR;
153156
switch(backoff){
@@ -161,13 +164,15 @@ I memcmpne(void *s, void *t, I l){
161164
case -7: u=_mm256_loadu_si256 ((__m256i*)(x+6*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+6*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v));
162165
case -8: u=_mm256_loadu_si256 ((__m256i*)(x+7*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+7*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v));
163166
x+=8*NPAR; y+=8*NPAR;
164-
if(~_mm256_movemask_epi8(allmatches))R 1;
167+
// obsolete if(~_mm256_movemask_epi8(allmatches))R 1;
168+
if(!_mm256_testc_si256(allmatches,ones))R 1; // test is '!(all bits of allmatches=1)'
165169
}while(--n2>0);
166170
}
167171
}
168172

169173
u=_mm256_maskload_epi64(x,endmask); v=_mm256_maskload_epi64(y,endmask);
170-
R 0!=~_mm256_movemask_epi8(_mm256_cmpeq_epi8(u,v)); // no miscompares, compare equal
174+
R !_mm256_testc_si256(_mm256_cmpeq_epi8(u,v),ones); // return 1 if any mismatch
175+
// obsolete R 0!=~_mm256_movemask_epi8(_mm256_cmpeq_epi8(u,v)); // no miscompares, compare equal
171176
}
172177

173178
// memcmpnefl: test for inequality, not caring about order, for float inputs, possibly with tolerance
@@ -180,6 +185,7 @@ I memcmpnefl(void *s, void *t, I l, J jt){
180185
D *x=s, *y=t; // access the arguments as doubles
181186
__m256d u,v;
182187
__m256i endmask = _mm256_loadu_si256((__m256i*)(validitymask+((-l)&(NPAR-1)))); // mask for 0 1 2 3 4 5 is xxxx 0001 0011 0111 1111 0001
188+
__m256d ones=_mm256_castsi256_pd(_mm256_cmpeq_epi8(endmask,endmask));
183189
if(jt->cct==1.0){
184190
// intolerant comparison
185191
UI n2=DUFFLPCT(l-1,3); /* # turns through duff loop */
@@ -198,24 +204,28 @@ I memcmpnefl(void *s, void *t, I l, J jt){
198204
case -7: u=_mm256_loadu_pd(x+6*NPAR); v=_mm256_loadu_pd(y+6*NPAR); allmatches=_mm256_and_pd(allmatches,_mm256_cmp_pd(u,v,_CMP_EQ_OQ));
199205
case -8: u=_mm256_loadu_pd(x+7*NPAR); v=_mm256_loadu_pd(y+7*NPAR); allmatches=_mm256_and_pd(allmatches,_mm256_cmp_pd(u,v,_CMP_EQ_OQ));
200206
x+=8*NPAR; y+=8*NPAR;
201-
if(0xf!=_mm256_movemask_pd(allmatches))R 1;
207+
// obsolete if(0xf!=_mm256_movemask_pd(allmatches))R 1;
208+
if(!_mm256_testc_pd(allmatches,ones))R 1; // test is '!(all bits of allmatches=1)'
202209
}while(--n2>0);
203210
}
204211
}
205212
u=_mm256_maskload_pd(x,endmask); v=_mm256_maskload_pd(y,endmask);
206-
R 0xf!=_mm256_movemask_pd(_mm256_cmp_pd(u,v,_CMP_EQ_OQ)); // no miscompares, compare equal
213+
// obsolete R 0xf!=_mm256_movemask_pd(_mm256_cmp_pd(u,v,_CMP_EQ_OQ)); // no miscompares, compare equal
214+
R !_mm256_testc_pd(_mm256_cmp_pd(u,v,_CMP_EQ_OQ),ones); // return 1 if any mismatch
207215
}
208216
UI i=(l-1)>>LGNPAR; /* # loops for 0 1 2 3 4 5 is x 0 0 0 0 1 */
209217
// tolerant comparison
210218
__m256d cct=_mm256_broadcast_sd(&jt->cct);
211219
if(i){
212220
do{
213221
u=_mm256_loadu_pd(x); v=_mm256_loadu_pd(y); x+=NPAR; y+=NPAR;
214-
if(0xf!=_mm256_movemask_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ))))R 1;
222+
// obsolete if(0xf!=_mm256_movemask_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ))))R 1;
223+
if(!_mm256_testc_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ)),ones))R 1;
215224
}while(--i>0);
216225
}
217226
u=_mm256_maskload_pd(x,endmask); v=_mm256_maskload_pd(y,endmask);
218-
R 0xf!=_mm256_movemask_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ)));
227+
// obsolete R 0xf!=_mm256_movemask_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ)));
228+
R !_mm256_testc_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ)),ones);
219229
}
220230

221231
static B eqvfl(I af,I wf,I m,I n,I k,D* RESTRICT av,D* RESTRICT wv,B* RESTRICT z,B b1,J jt){
@@ -228,6 +238,7 @@ static B eqvfl(I af,I wf,I m,I n,I k,D* RESTRICT av,D* RESTRICT wv,B* RESTRICT z
228238
__m256d u,v;
229239
// prep for each compare loop
230240
__m256i endmask = _mm256_loadu_si256((__m256i*)(validitymask+((-k)&(NPAR-1)))); // mask for 0 1 2 3 4 5 is xxxx 0001 0011 0111 1111 0001
241+
__m256d ones=_mm256_castsi256_pd(_mm256_cmpeq_epi8(endmask,endmask));
231242
UI n2=DUFFLPCT(k-1,3); /* # turns through duff loop */
232243
UI backoff=DUFFBACKOFF(k-1,3);
233244
UI i0=(k-1)>>LGNPAR; /* # loops for 0 1 2 3 4 5 is x 0 0 0 0 1 used for tolerant */
@@ -260,25 +271,29 @@ static B eqvfl(I af,I wf,I m,I n,I k,D* RESTRICT av,D* RESTRICT wv,B* RESTRICT z
260271
case -8: u=_mm256_loadu_pd(x+7*NPAR); v=_mm256_loadu_pd(y+7*NPAR); allmatches=_mm256_and_pd(allmatches,_mm256_cmp_pd(u,v,_CMP_EQ_OQ));
261272
x+=8*NPAR; y+=8*NPAR;
262273
if(n2==1)goto oneloop; // if we don't have to loop here, avoid the data-dependent branch and fold the comparisons into the last batch
263-
if(0xf!=_mm256_movemask_pd(allmatches))goto fail;
274+
// obsolete if(0xf!=_mm256_movemask_pd(allmatches))goto fail;
275+
if(!_mm256_testc_pd(allmatches,ones))goto fail; // test is '!(all bits of allmatches=1)'
264276
}while(--i>0);
265277
}
266278
}
267279
oneloop:
268280
u=_mm256_maskload_pd(x,endmask); v=_mm256_maskload_pd(y,endmask);
269-
b ^= 0xf==_mm256_movemask_pd(_mm256_and_pd(allmatches,_mm256_cmp_pd(u,v,_CMP_EQ_OQ))); // no miscompares, compare equal
281+
// obsolete b ^= 0xf==_mm256_movemask_pd(_mm256_and_pd(allmatches,_mm256_cmp_pd(u,v,_CMP_EQ_OQ))); // no miscompares, compare equal
282+
b ^= _mm256_testc_pd(_mm256_and_pd(allmatches,_mm256_cmp_pd(u,v,_CMP_EQ_OQ)),ones); // no miscompares, switch failure value to success. test 1=good
270283
}else{
271284
// tolerant comparison
272285
__m256d cct=_mm256_broadcast_sd(&jt->cct);
273286
if(i0){
274287
UI i = i0; // inner loop size
275288
do{ // unfortunately it's probably not worth checking for lengths 5-8 & we will have a misbranch whenever length > 4
276289
u=_mm256_loadu_pd(x); v=_mm256_loadu_pd(y); x+=NPAR; y+=NPAR;
277-
if(0xf!=_mm256_movemask_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ))))goto fail;
290+
// obsolete if(0xf!=_mm256_movemask_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ))))goto fail;
291+
if(!_mm256_testc_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ)),ones))goto fail;
278292
}while(--i>0);
279293
}
280294
u=_mm256_maskload_pd(x,endmask); v=_mm256_maskload_pd(y,endmask);
281-
b ^= 0xf==_mm256_movemask_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ)));
295+
// obsolete b ^= 0xf==_mm256_movemask_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ)));
296+
b ^= _mm256_testc_pd(_mm256_xor_pd(_mm256_cmp_pd(u,_mm256_mul_pd(v,cct),_CMP_GT_OQ),_mm256_cmp_pd(v,_mm256_mul_pd(u,cct),_CMP_LE_OQ)),ones);
282297
}
283298

284299
fail:

jsrc/vfrom.c

Lines changed: 86 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -772,7 +772,91 @@ static unsigned char jtmvmsparsex(J jt,void* const ctx,UI4 ti){
772772
limitrow=-1; // init no eligible row found
773773
}else limitrow=0; // for Dpiv, init to none found
774774
#if 0
775-
// process the column NPAR values at a time
775+
I epcol=AR(qk)==3; // flag if we are doing an extended-precision column fetch scaf move out of loop, and into limitrow
776+
__m256d sgnbit=_mm256_broadcast_sd((D*)&Iimin); // scaf move out of loop
777+
__m256i rowstride=_mm256_set1_epi64x(n); // number of Ds in a row of M, in each lane
778+
// init for the column
779+
__m256i endmask=_mm256_cmpeq_epi64(sgnbit,sgnbit); /* length mask for the last word */
780+
__m256i indexes, rownums; // row numbers and atom indexes for the rows we are fetching
781+
I *bvgrd=bvgrd0; I i=-1; D bkold=inf, cold=1.0; I bkle0=1;
782+
if(bv==0&&zv!=0){ // one-column mode: gather offsets address successive rows
783+
..
784+
}
785+
// create the column NPAR values at a time
786+
do{
787+
__m256i indexes; // indexes for the rows we are fetching
788+
__m256d dotproducth,dotproductl; // where we build the column value
789+
// get the validity mask for the gather and process: leave as ones until the end of the column
790+
if(unlikely(bvgrde-bvgrd<NPAR))endmask = _mm256_loadu_si256((__m256i*)(validitymask+NPAR-(bvgrde-bvgrd))); /* mask for 00=1111, 01=1000, 10=1100, 11=1110 */
791+
// if DIP/Dpiv, fetch the row#s of the next group to process, in bkg order
792+
if(bv!=0||zv==0){
793+
..
794+
}else{
795+
rownums=_mm256_add_epi64(rownums,_mm256_sll_epi64(rowstride,LGNPAR)); // advance NPAR rows to next sequential values
796+
}
797+
indexes=_mm256_mul_epu32(rowstride,rownums);
798+
// Now mv and indexes are set up to read the correct rows
799+
// get the next NPAR values by dot-product with Av
800+
if(colx<n){
801+
// fetching from the Ek matrix itself. Just fetch the values from the column
802+
dotproducth=_mm256_setzero_pd(); dotproducth=_mm256_mask_i64gather_pd(qkvh,mv0+colx,indexes,endmask,SZI);
803+
if(epcol){dotproductl=_mm256_setzero_pd(); dotproductl=_mm256_mask_i64gather_pd(qkvh,mv0+n*n+colx,indexes,endmask,SZI);}
804+
}else{
805+
// fetching from A. Form Ek row . A column for each of the 4 rows
806+
I an=axv[colx][1]; // number of sparse atoms in each row
807+
D *vv=avv0+axv[colx][0]; // pointer to values for this section of A
808+
I *iv=amv0+axv[colx][0]; // pointer to row numbers of the values in *vv (these are the columns we fetch in turn from Ek)
809+
if(likely(!epcol)){
810+
// single-precision accumulate
811+
dotproducth=_mm256_setzero_pd();
812+
I k;
813+
NOUNROLL for(k=0;k<an;++k){
814+
dotproductl=_mm256_setzero_pd(); dotproductl=_mm256_mask_i64gather_pd(qkvh,mv0+iv[k],indexes,endmask,SZI); // fetch from up to 4 rows
815+
dotproducth=_mm256_fmadd_pd(dotproductl,_mm256_set1_pd(vv[k]),dotproducth); // accumulate the dot-product
816+
}
817+
}else{
818+
// extended-precision accumulate
819+
__m256d th,tl,tl2,vval; // temps for value loaded, and multiplier from A column
820+
if(likely(an!=0)){
821+
// get column number to fetch; fetch 4 rows
822+
th=_mm256_setzero_pd(); th=_mm256_mask_i64gather_pd(qkvh,mv0+iv[0],indexes,endmask,SZI); // fetch from up to 4 rows
823+
tl=_mm256_setzero_pd(); tl=_mm256_mask_i64gather_pd(qkvh,mv0+n*n+iv[0],indexes,endmask,SZI); // fetch from up to 4 rows
824+
vval=_mm256_set1_pd(vv[0]); // load column value
825+
// initialize the dotproduct with the first product
826+
TWOPROD(th,vval,dotproducth,tl2) // high qk * col
827+
tl2=_mm256_fmadd_pd(tl,vval,tl2); // low qk*col, and add in extension of prev product
828+
TWOADD(dotproducth,tl2,dotproducth,dotproductl) // combine high & low
829+
I k;
830+
for(k=1;k<an;++k){ // for each other element of the dot-product
831+
// get column number to fetch; fetch 4 rows
832+
th=_mm256_setzero_pd(); th=_mm256_mask_i64gather_pd(qkvh,mv0+iv[k],indexes,endmask,SZI); // fetch from up to 4 rows
833+
tl=_mm256_setzero_pd(); tl=_mm256_mask_i64gather_pd(qkvh,mv0+n*n+iv[k],indexes,endmask,SZI); // fetch from up to 4 rows
834+
vval=_mm256_set1_pd(vv[k]); // load column value
835+
// accumulate the dot-product
836+
TWOPROD(th,vval,th,tl2) // high qk * col
837+
tl2=_mm256_fmadd_pd(tl,vval,tl2); // low qk*col, and add in extension of prev product
838+
TWOADD(dotproducth,th,dotproducth,vval) tl2=_mm256_add_pd(vval,tl2); // add high parts & accum extension
839+
TWOADD(dotproducth,tl2,dotproducth,dotproductl) // combine high & extension for final form
840+
}
841+
}else{dotproducth=dotproductl=_mm256_setzero_pd();} // no columns should not occur. Just 1 shouldn't either
842+
}
843+
}
844+
// process the NPAR generated values
845+
if(bv==0&&zv!=0){
846+
// one-column mode: just store out the values
847+
..
848+
}else{
849+
// DIP/Dpiv mode: process each value in turn. Since 0 values are never pivots, we can stop when all remaining values are 0
850+
indexes=rownums; // repurpose indexes to hold the row-number we are working on
851+
while(_mm256_movemask_pd(_mm256_cmp_pd(dotproducth,_mm256_setzero_pd(),_CMP_EQ_OQ))!=0xf){
852+
dotprod=_mm256_permute4x64_pd(dotproducth,0b00000000); // copy next value into all lanes
853+
dotproducth=_mm256_permute4x64_pd(dotproducth,0b11111001); dotproducth=_mm256_blend_pd(dotproducth,_mm256_setzero_pd(),0b1000); // shift down one value for next time
854+
i=_mm256_extract_epi64(indexes,0); indexes=_mm256_permute4x64_epi64(indexes,0b11111001); // get the row number we are trying to swap out; shift row number down for next loop
855+
PROCESSROWRATIOS
856+
}
857+
}
858+
}
859+
}while((bvgrd+=NPAR)<bvgrde);
776860
#else
777861
#define COLLPINIT I *bvgrd=bvgrd0; I i=-1; D *mv=mv0-n; D bkold=inf, cold=1.0; I bkle0=1;
778862
#define COLLP do{if(unlikely(zv!=0)){++i; mv+=n;}else{i=*bvgrd; mv=mv0+n*i;} // for each row, i is the row#, mv points to the beginning of the row of M. If we take the whole col, take it in order for cache. Prefetch next row?
@@ -970,7 +1054,7 @@ static unsigned char jtmvmsparsex(J jt,void* const ctx,UI4 ti){
9701054
// rc=5 (not created - means problem is infeasible) rc=6=empty M, problem is malformed
9711055
// if the exclusion list is given, we stop on the first nonimproving pivot, and the exclusion list is used to prevent repetition of basis
9721056
// If Frow is empty, we are looking for nonimproving pivots in rows where the selector is 0. In that case the bkgrd puts the bk values in descending order. We return the first column that will make more 0 B rows non0 than non0 B rows 0.
973-
// If bk is empty, we are counting the #places where c>=PivTol and accumulating into Dpiv under control of Dpivdir (-1=decr, 1=incr; init to 0 if neg)
1057+
// If bk is empty, we are looking in bkgrd columns and counting the #places where c>=PivTol and accumulating into Dpiv under control of Dpivdir (-1=decr, 1=incr; init to 0 if neg)
9741058
// Rank is infinite
9751059
F1(jtmvmsparse){PROLOG(832);
9761060
#if C_AVX2

0 commit comments

Comments
 (0)