Skip to content

Commit 340e217

Browse files
committed
finish testc_pd
1 parent 3f83076 commit 340e217

7 files changed

Lines changed: 60 additions & 45 deletions

File tree

jsrc/v1.c

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -98,31 +98,31 @@ static B eqv(I af,I wf,I m,I n,I k,C* RESTRICT av,C* RESTRICT wv,B* RESTRICT z,B
9898
// fetch the load mask for the last block: the words to load, including any trailing fragment
9999
// step up to qword boundary
100100
I *x=(I*)((C*)av+((k-1)&(SZI-1))+1), *y=(I*)((C*)wv+((k-1)&(SZI-1))+1); // access the arguments as Is
101-
__m256i allmatches =_mm256_cmpeq_epi8(endmask,endmask),ones=allmatches; // accumuland for compares init to all 1
101+
__m256i allmatches =_mm256_cmpeq_epi8(endmask,endmask); __m256d ones=_mm256_castsi256_pd(allmatches); // accumuland for compares init to all 1
102102
b=b1; // init store value to compare failure
103103
if(n2>0){
104104
UI i = n2; // inner loop size
105105
x+=(backoff+1)*NPAR; y+=(backoff+1)*NPAR;
106106
switch(backoff){
107107
do{
108-
case -1: u=_mm256_loadu_si256 ((__m256i*)x); v=_mm256_loadu_si256 ((__m256i*)y); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v));
109-
case -2: u=_mm256_loadu_si256 ((__m256i*)(x+1*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+1*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v));
110-
case -3: u=_mm256_loadu_si256 ((__m256i*)(x+2*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+2*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v));
111-
case -4: u=_mm256_loadu_si256 ((__m256i*)(x+3*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+3*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v));
112-
case -5: u=_mm256_loadu_si256 ((__m256i*)(x+4*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+4*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v));
113-
case -6: u=_mm256_loadu_si256 ((__m256i*)(x+5*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+5*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v));
114-
case -7: u=_mm256_loadu_si256 ((__m256i*)(x+6*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+6*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v));
115-
case -8: u=_mm256_loadu_si256 ((__m256i*)(x+7*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+7*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v));
108+
case -1: u=_mm256_loadu_si256 ((__m256i*)x); v=_mm256_loadu_si256 ((__m256i*)y); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi64(u,v));
109+
case -2: u=_mm256_loadu_si256 ((__m256i*)(x+1*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+1*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi64(u,v));
110+
case -3: u=_mm256_loadu_si256 ((__m256i*)(x+2*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+2*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi64(u,v));
111+
case -4: u=_mm256_loadu_si256 ((__m256i*)(x+3*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+3*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi64(u,v));
112+
case -5: u=_mm256_loadu_si256 ((__m256i*)(x+4*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+4*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi64(u,v));
113+
case -6: u=_mm256_loadu_si256 ((__m256i*)(x+5*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+5*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi64(u,v));
114+
case -7: u=_mm256_loadu_si256 ((__m256i*)(x+6*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+6*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi64(u,v));
115+
case -8: u=_mm256_loadu_si256 ((__m256i*)(x+7*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+7*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi64(u,v));
116116
x+=8*NPAR; y+=8*NPAR;
117117
if(n2==1)goto oneloop; // if we don't have to loop here, avoid the data-dependent branch and fold the comparisons into the last batch
118118
// obsolete if(~_mm256_movemask_epi8(allmatches))goto fail; // if searches are long, kick out when there is a miscompare
119-
if(!_mm256_testc_si256(allmatches,ones))goto fail; // if searches are long, kick out when there is a miscompare. test is '!(all bits of allmatches =1)'
119+
if(!_mm256_testc_pd(_mm256_castsi256_pd(allmatches),ones))goto fail; // if searches are long, kick out when there is a miscompare. test is '!(all bits of allmatches =1)'
120120
}while(--i>0);
121121
}
122122
oneloop:;
123123
}
124124
u=_mm256_maskload_epi64(x,endmask); v=_mm256_maskload_epi64(y,endmask);
125-
b ^= _mm256_testc_si256(_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v)),ones); // no miscompares, switch failure value to success. test 1=good
125+
b ^= _mm256_testc_pd(_mm256_castsi256_pd(_mm256_and_si256(allmatches,_mm256_cmpeq_epi64(u,v))),ones); // no miscompares, switch failure value to success. test 1=good
126126
// obsolete b ^= 0==~_mm256_movemask_epi8(_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v))); // no miscompares, switch failure value to success
127127
fail:
128128
*z++=b; // store one result
@@ -146,7 +146,7 @@ I memcmpne(void *s, void *t, I l){
146146
I n=(l-1)>>LGSZI; // number of Ds to process - cannot be 0
147147
__m256i u,v;
148148
__m256i endmask = _mm256_loadu_si256((__m256i*)(validitymask+((-n)&(NPAR-1)))); // mask for 0 1 2 3 4 5 is xxxx 0001 0011 0111 1111 0001
149-
__m256i ones=_mm256_cmpeq_epi8(endmask,endmask);
149+
__m256d ones=_mm256_castsi256_pd(_mm256_cmpeq_epi8(endmask,endmask));
150150

151151
UI n2=DUFFLPCT(n-1,3); /* # turns through duff loop */
152152
if(n2>0){
@@ -155,23 +155,23 @@ I memcmpne(void *s, void *t, I l){
155155
x+=(backoff+1)*NPAR; y+=(backoff+1)*NPAR;
156156
switch(backoff){
157157
do{
158-
case -1: u=_mm256_loadu_si256 ((__m256i*)x); v=_mm256_loadu_si256 ((__m256i*)y); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v));
159-
case -2: u=_mm256_loadu_si256 ((__m256i*)(x+1*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+1*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v));
160-
case -3: u=_mm256_loadu_si256 ((__m256i*)(x+2*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+2*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v));
161-
case -4: u=_mm256_loadu_si256 ((__m256i*)(x+3*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+3*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v));
162-
case -5: u=_mm256_loadu_si256 ((__m256i*)(x+4*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+4*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v));
163-
case -6: u=_mm256_loadu_si256 ((__m256i*)(x+5*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+5*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v));
164-
case -7: u=_mm256_loadu_si256 ((__m256i*)(x+6*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+6*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v));
165-
case -8: u=_mm256_loadu_si256 ((__m256i*)(x+7*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+7*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v));
158+
case -1: u=_mm256_loadu_si256 ((__m256i*)x); v=_mm256_loadu_si256 ((__m256i*)y); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi64(u,v));
159+
case -2: u=_mm256_loadu_si256 ((__m256i*)(x+1*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+1*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi64(u,v));
160+
case -3: u=_mm256_loadu_si256 ((__m256i*)(x+2*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+2*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi64(u,v));
161+
case -4: u=_mm256_loadu_si256 ((__m256i*)(x+3*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+3*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi64(u,v));
162+
case -5: u=_mm256_loadu_si256 ((__m256i*)(x+4*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+4*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi64(u,v));
163+
case -6: u=_mm256_loadu_si256 ((__m256i*)(x+5*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+5*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi64(u,v));
164+
case -7: u=_mm256_loadu_si256 ((__m256i*)(x+6*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+6*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi64(u,v));
165+
case -8: u=_mm256_loadu_si256 ((__m256i*)(x+7*NPAR)); v=_mm256_loadu_si256 ((__m256i*)(y+7*NPAR)); allmatches=_mm256_and_si256(allmatches,_mm256_cmpeq_epi64(u,v));
166166
x+=8*NPAR; y+=8*NPAR;
167167
// obsolete if(~_mm256_movemask_epi8(allmatches))R 1;
168-
if(!_mm256_testc_si256(allmatches,ones))R 1; // test is '!(all bits of allmatches=1)'
168+
if(!_mm256_testc_pd(_mm256_castsi256_pd(allmatches),ones))R 1; // test is '!(all bits of allmatches=1)'
169169
}while(--n2>0);
170170
}
171171
}
172172

173173
u=_mm256_maskload_epi64(x,endmask); v=_mm256_maskload_epi64(y,endmask);
174-
R !_mm256_testc_si256(_mm256_cmpeq_epi8(u,v),ones); // return 1 if any mismatch
174+
R !_mm256_testc_pd(_mm256_castsi256_pd(_mm256_cmpeq_epi64(u,v)),ones); // return 1 if any mismatch
175175
// obsolete R 0!=~_mm256_movemask_epi8(_mm256_cmpeq_epi8(u,v)); // no miscompares, compare equal
176176
}
177177

jsrc/va1.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,8 @@ AHDR1(sqrtD,D,D){
5858
u=_mm256_sqrt_pd(_mm256_blendv_pd(u,comp,neg)); comp=_mm256_sub_pd(zero,u); u=_mm256_blendv_pd(u,comp,neg); // store sqrt, with sign of the original value
5959

6060
,
61-
R (_mm256_movemask_pd(anyneg)&0xf)?EWIMAG:EVOK; // if there are any negative values, call for a postpass
61+
// obsolete R (_mm256_movemask_pd(anyneg)&0xf)?EWIMAG:EVOK; // if there are any negative values, call for a postpass
62+
R (!_mm256_testc_pd(zero,anyneg))?EWIMAG:EVOK; // if there are any negative values, call for a postpass
6263
)
6364
}
6465

jsrc/ve.c

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,16 @@ primop256(plusDB,0xa00,,zz=_mm256_add_pd(xx,yy),R EVOK;)
4444
primop256(plusBD,0x900,,zz=_mm256_add_pd(xx,yy),R EVOK;)
4545
primop256(plusII,0x21,__m256d oflo=_mm256_setzero_pd();,
4646
zz=_mm256_castsi256_pd(_mm256_add_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy))); oflo=_mm256_or_pd(oflo,_mm256_andnot_pd(_mm256_xor_pd(xx,yy),_mm256_xor_pd(xx,zz)));,
47-
R _mm256_movemask_pd(oflo)?EWOVIP+EWOVIPPLUSII:EVOK;)
47+
// obsolete R _mm256_movemask_pd(oflo)?EWOVIP+EWOVIPPLUSII:EVOK;)
48+
R !_mm256_testc_pd(_mm256_setzero_pd(),oflo)?EWOVIP+EWOVIPPLUSII:EVOK;) // ~0 & oflo, testc if =0 which means no overflow
4849
primop256(plusBI,0x860,__m256d oflo=_mm256_setzero_pd();,
4950
zz=_mm256_castsi256_pd(_mm256_add_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy))); oflo=_mm256_or_pd(oflo,_mm256_castsi256_pd(_mm256_cmpgt_epi32(_mm256_castpd_si256(yy),_mm256_castpd_si256(zz))));,
50-
R _mm256_movemask_pd(oflo)?EWOVIP+EWOVIPPLUSBI:EVOK;)
51+
// obsolete R _mm256_movemask_pd(oflo)?EWOVIP+EWOVIPPLUSBI:EVOK;)
52+
R !_mm256_testc_pd(_mm256_setzero_pd(),oflo)?EWOVIP+EWOVIPPLUSBI:EVOK;) // ~0 & oflo, testc if =0 which means no overflow
5153
primop256(plusIB,0x8a0,__m256d oflo=_mm256_setzero_pd();,
5254
zz=_mm256_castsi256_pd(_mm256_add_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy))); oflo=_mm256_or_pd(oflo,_mm256_castsi256_pd(_mm256_cmpgt_epi32(_mm256_castpd_si256(xx),_mm256_castpd_si256(zz))));,
53-
R _mm256_movemask_pd(oflo)?EWOVIP+EWOVIPPLUSIB:EVOK;)
55+
// obsolete R _mm256_movemask_pd(oflo)?EWOVIP+EWOVIPPLUSIB:EVOK;)
56+
R !_mm256_testc_pd(_mm256_setzero_pd(),oflo)?EWOVIP+EWOVIPPLUSIB:EVOK;) // ~0 & oflo, testc if =0 which means no overflow
5457
primop256(plusBB,0xc0,,
5558
zz=_mm256_castsi256_pd(_mm256_add_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy)));,R EVOK;)
5659
primop256(minusDI,16,,zz=_mm256_sub_pd(xx,yy),R EVOK;)
@@ -59,13 +62,16 @@ primop256(minusDB,0xa00,,zz=_mm256_sub_pd(xx,yy),R EVOK;)
5962
primop256(minusBD,0x100,,zz=_mm256_sub_pd(xx,yy),R EVOK;)
6063
primop256(minusII,0x22,__m256d oflo=_mm256_setzero_pd();,
6164
zz=_mm256_castsi256_pd(_mm256_sub_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy))); oflo=_mm256_or_pd(oflo,_mm256_and_pd(_mm256_xor_pd(xx,yy),_mm256_xor_pd(xx,zz)));,
62-
R _mm256_movemask_pd(oflo)?EWOVIP+EWOVIPMINUSII:EVOK;)
65+
// obsolete R _mm256_movemask_pd(oflo)?EWOVIP+EWOVIPMINUSII:EVOK;)
66+
R !_mm256_testc_pd(_mm256_setzero_pd(),oflo)?EWOVIP+EWOVIPMINUSII:EVOK;) // ~0 & oflo, testc if =0 which means no overflow
6367
primop256(minusBI,0x62,__m256d oflo=_mm256_setzero_pd();,
6468
zz=_mm256_castsi256_pd(_mm256_sub_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy)));oflo=_mm256_or_pd(oflo,_mm256_and_pd(zz,yy));, // only oflo is b - imin,
65-
R _mm256_movemask_pd(oflo)?EWOVIP+EWOVIPMINUSBI:EVOK;)
69+
// obsolete R _mm256_movemask_pd(oflo)?EWOVIP+EWOVIPMINUSBI:EVOK;)
70+
R !_mm256_testc_pd(_mm256_setzero_pd(),oflo)?EWOVIP+EWOVIPMINUSBI:EVOK;) // ~0 & oflo, testc if =0 which means no overflow
6671
primop256(minusIB,0x8a2,__m256d oflo=_mm256_setzero_pd();,
6772
zz=_mm256_castsi256_pd(_mm256_sub_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy))); oflo=_mm256_or_pd(oflo,_mm256_castsi256_pd(_mm256_cmpgt_epi64(_mm256_castpd_si256(zz),_mm256_castpd_si256(xx))));,
68-
R _mm256_movemask_pd(oflo)?EWOVIP+EWOVIPMINUSIB:EVOK;)
73+
// obsolete R _mm256_movemask_pd(oflo)?EWOVIP+EWOVIPMINUSIB:EVOK;)
74+
R !_mm256_testc_pd(_mm256_setzero_pd(),oflo)?EWOVIP+EWOVIPMINUSIB:EVOK;) // ~0 & oflo, testc if =0 which means no overflow
6975
primop256(minusBB,0xe0,,
7076
zz=_mm256_castsi256_pd(_mm256_sub_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy)));,R EVOK;)
7177
primop256(minDI,16,,zz=_mm256_min_pd(xx,yy),R EVOK;)

jsrc/vfrom.c

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -123,22 +123,23 @@ F2(jtifrom){A z;C*wv,*zv;I acr,an,ar,*av,j,k,m,p,pq,q,wcr,wf,wn,wr,*ws,zn;
123123
if(an>NPAR){
124124
indexes0=_mm256_loadu_si256((__m256i*)av); // fetch a block of indexes
125125
indexes0=_mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(indexes0),_mm256_castsi256_pd(_mm256_add_epi64(indexes0,wstride)),_mm256_castsi256_pd(indexes0))); // get indexes, add axis len if neg
126-
ASSERT(_mm256_movemask_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexes0,_mm256_sub_epi64(indexes0,wstride))))==0xf,EVINDEX); // positive, and negative if you subtract axis length
126+
// obsolete ASSERT(_mm256_movemask_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexes0,_mm256_sub_epi64(indexes0,wstride))))==0xf,EVINDEX); // positive, and negative if you subtract axis length
127+
ASSERT(_mm256_testc_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexes0,_mm256_sub_epi64(indexes0,wstride))),_mm256_castsi256_pd(ones)),EVINDEX); // positive, and negative if you subtract axis length
127128
if(an>2*NPAR){
128129
indexes1=_mm256_loadu_si256((__m256i*)(av+NPAR)); // fetch a block of indexes
129130
indexes1=_mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(indexes1),_mm256_castsi256_pd(_mm256_add_epi64(indexes1,wstride)),_mm256_castsi256_pd(indexes1))); // get indexes, add axis len if neg
130-
ASSERT(_mm256_movemask_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexes1,_mm256_sub_epi64(indexes1,wstride))))==0xf,EVINDEX); // positive, and negative if you subtract axis length
131+
ASSERT(_mm256_testc_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexes1,_mm256_sub_epi64(indexes1,wstride))),_mm256_castsi256_pd(ones)),EVINDEX); // positive, and negative if you subtract axis length
131132
if(an>3*NPAR){
132133
indexes2=_mm256_loadu_si256((__m256i*)(av+2*NPAR)); // fetch a block of indexes
133134
indexes2=_mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(indexes2),_mm256_castsi256_pd(_mm256_add_epi64(indexes2,wstride)),_mm256_castsi256_pd(indexes2))); // get indexes, add axis len if neg
134-
ASSERT(_mm256_movemask_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexes2,_mm256_sub_epi64(indexes2,wstride))))==0xf,EVINDEX); // positive, and negative if you subtract axis length
135+
ASSERT(_mm256_testc_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexes2,_mm256_sub_epi64(indexes2,wstride))),_mm256_castsi256_pd(ones)),EVINDEX); // positive, and negative if you subtract axis length
135136
}
136137
}
137138
}
138139
if(an<=4*NPAR){
139140
indexesn=_mm256_maskload_epi64(av+((an-1)&-NPAR),endmask); // fetch last block of indexes
140141
indexesn=_mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(indexesn),_mm256_castsi256_pd(_mm256_add_epi64(indexesn,wstride)),_mm256_castsi256_pd(indexesn))); // get indexes, add axis len if neg
141-
ASSERT(_mm256_movemask_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexesn,_mm256_sub_epi64(indexesn,wstride))))==0xf,EVINDEX); // positive, and negative if you subtract axis length
142+
ASSERT(_mm256_testc_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexesn,_mm256_sub_epi64(indexesn,wstride))),_mm256_castsi256_pd(ones)),EVINDEX); // positive, and negative if you subtract axis length
142143
// Now do the gather/writes
143144
if(an<=NPAR){
144145
do{_mm256_maskstore_epi64(x, endmask, _mm256_mask_i64gather_epi64(_mm256_setzero_si256(),v,indexesn,endmask,SZI)); v+=p; x+=an;}while(--i);
@@ -158,7 +159,7 @@ F2(jtifrom){A z;C*wv,*zv;I acr,an,ar,*av,j,k,m,p,pq,q,wcr,wf,wn,wr,*ws,zn;
158159
}else{
159160
indexesn=_mm256_loadu_si256((__m256i*)(av+3*NPAR)); // fetch last block of indexes
160161
indexesn=_mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(indexesn),_mm256_castsi256_pd(_mm256_add_epi64(indexesn,wstride)),_mm256_castsi256_pd(indexesn))); // get indexes, add axis len if neg
161-
ASSERT(_mm256_movemask_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexesn,_mm256_sub_epi64(indexesn,wstride))))==0xf,EVINDEX); // positive, and negative if you subtract axis length
162+
ASSERT(_mm256_testc_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexesn,_mm256_sub_epi64(indexesn,wstride))),_mm256_castsi256_pd(ones)),EVINDEX); // positive, and negative if you subtract axis length
162163
do{
163164
// 17+indexes. We must read the tail repeatedly
164165
// this first execution audits the indexes and converts negatives
@@ -174,17 +175,17 @@ F2(jtifrom){A z;C*wv,*zv;I acr,an,ar,*av,j,k,m,p,pq,q,wcr,wf,wn,wr,*ws,zn;
174175
__m256i indexesx=indexes; // fetch a block of indexes
175176
indexes=_mm256_loadu_si256((__m256i*)avv); avv+=NPAR; // fetch a block of indexes
176177
anynegindex=_mm256_or_si256(anynegindex,indexesx); indexesx=_mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(indexesx),_mm256_castsi256_pd(_mm256_add_epi64(indexesx,wstride)),_mm256_castsi256_pd(indexesx))); // get indexes, add axis len if neg
177-
ASSERT(_mm256_movemask_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexesx,_mm256_sub_epi64(indexesx,wstride))))==0xf,EVINDEX); // positive, and negative if you subtract axis length
178+
ASSERT(_mm256_testc_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexesx,_mm256_sub_epi64(indexesx,wstride))),_mm256_castsi256_pd(ones)),EVINDEX); // positive, and negative if you subtract axis length
178179
_mm256_storeu_si256((__m256i*)x, _mm256_mask_i64gather_epi64(_mm256_setzero_si256(),v,indexesx,ones,SZI)); x+=NPAR;
179180
)
180181
anynegindex=_mm256_or_si256(anynegindex,indexes); indexes=_mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(indexes),_mm256_castsi256_pd(_mm256_add_epi64(indexes,wstride)),_mm256_castsi256_pd(indexes))); // get indexes, add axis len if neg
181-
ASSERT(_mm256_movemask_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexes,_mm256_sub_epi64(indexes,wstride))))==0xf,EVINDEX); // positive, and negative if you subtract axis length
182+
ASSERT(_mm256_testc_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexes,_mm256_sub_epi64(indexes,wstride))),_mm256_castsi256_pd(ones)),EVINDEX); // positive, and negative if you subtract axis length
182183
_mm256_storeu_si256((__m256i*)x, _mm256_mask_i64gather_epi64(_mm256_setzero_si256(),v,indexes,ones,SZI)); x+=NPAR;
183184
}
184185
// runout using mask
185186
indexes=_mm256_maskload_epi64(avv,endmask); // fetch a block of indexes
186187
anynegindex=_mm256_or_si256(anynegindex,indexes); indexes=_mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(indexes),_mm256_castsi256_pd(_mm256_add_epi64(indexes,wstride)),_mm256_castsi256_pd(indexes))); // get indexes, add axis len if neg. unfetched indexes are 0
187-
ASSERT(_mm256_movemask_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexes,_mm256_sub_epi64(indexes,wstride))))==0xf,EVINDEX); // positive, and negative if you subtract axis length
188+
ASSERT(_mm256_testc_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexes,_mm256_sub_epi64(indexes,wstride))),_mm256_castsi256_pd(ones)),EVINDEX); // positive, and negative if you subtract axis length
188189
_mm256_maskstore_epi64(x, endmask, _mm256_mask_i64gather_epi64(_mm256_setzero_si256(),v,indexes,endmask,SZI)); x+=((an-1)&(NPAR-1))+1; // must use a different reg for source and index, lest VS2013 create an illegal instruction
189190
v+=p; // advance to next input cell
190191
--i;

0 commit comments

Comments
 (0)