You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
if(n2==1)goto oneloop; // if we don't have to loop here, avoid the data-dependent branch and fold the comparisons into the last batch
118
118
// obsolete if(~_mm256_movemask_epi8(allmatches))goto fail; // if searches are long, kick out when there is a miscompare
119
-
if(!_mm256_testc_si256(allmatches,ones))goto fail; // if searches are long, kick out when there is a miscompare. test is '!(all bits of allmatches =1)'
119
+
if(!_mm256_testc_pd(_mm256_castsi256_pd(allmatches),ones))goto fail; // if searches are long, kick out when there is a miscompare. test is '!(all bits of allmatches =1)'
b ^= _mm256_testc_si256(_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v)),ones); // no miscompares, switch failure value to success. test 1=good
125
+
b ^= _mm256_testc_pd(_mm256_castsi256_pd(_mm256_and_si256(allmatches,_mm256_cmpeq_epi64(u,v))),ones); // no miscompares, switch failure value to success. test 1=good
126
126
// obsolete b ^= 0==~_mm256_movemask_epi8(_mm256_and_si256(allmatches,_mm256_cmpeq_epi8(u,v))); // no miscompares, switch failure value to success
127
127
fail:
128
128
*z++=b; // store one result
@@ -146,7 +146,7 @@ I memcmpne(void *s, void *t, I l){
146
146
I n=(l-1)>>LGSZI; // number of Ds to process - cannot be 0
147
147
__m256i u,v;
148
148
__m256i endmask=_mm256_loadu_si256((__m256i*)(validitymask+((-n)&(NPAR-1)))); // mask for 0 1 2 3 4 5 is xxxx 0001 0011 0111 1111 0001
Copy file name to clipboard. Expand all lines: jsrc/va1.c
+2 -1. Lines changed: 2 additions & 1 deletion
Original file line number
Diff line number
Diff line change
@@ -58,7 +58,8 @@ AHDR1(sqrtD,D,D){
58
58
u=_mm256_sqrt_pd(_mm256_blendv_pd(u,comp,neg)); comp=_mm256_sub_pd(zero,u); u=_mm256_blendv_pd(u,comp,neg); // store sqrt, with sign of the original value
59
59
60
60
,
61
-
R (_mm256_movemask_pd(anyneg)&0xf)?EWIMAG:EVOK; // if there are any negative values, call for a postpass
61
+
// obsolete R (_mm256_movemask_pd(anyneg)&0xf)?EWIMAG:EVOK; // if there are any negative values, call for a postpass
62
+
R (!_mm256_testc_pd(zero,anyneg))?EWIMAG:EVOK; // if there are any negative values, call for a postpass
zz=_mm256_castsi256_pd(_mm256_sub_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy)));oflo=_mm256_or_pd(oflo,_mm256_and_pd(zz,yy));, // only oflo is b - imin,
indexes0=_mm256_loadu_si256((__m256i*)av); // fetch a block of indexes
125
125
indexes0=_mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(indexes0),_mm256_castsi256_pd(_mm256_add_epi64(indexes0,wstride)),_mm256_castsi256_pd(indexes0))); // get indexes, add axis len if neg
126
-
ASSERT(_mm256_movemask_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexes0,_mm256_sub_epi64(indexes0,wstride))))==0xf,EVINDEX); // positive, and negative if you subtract axis length
126
+
// obsolete ASSERT(_mm256_movemask_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexes0,_mm256_sub_epi64(indexes0,wstride))))==0xf,EVINDEX); // positive, and negative if you subtract axis length
127
+
ASSERT(_mm256_testc_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexes0,_mm256_sub_epi64(indexes0,wstride))),_mm256_castsi256_pd(ones)),EVINDEX); // positive, and negative if you subtract axis length
127
128
if(an>2*NPAR){
128
129
indexes1=_mm256_loadu_si256((__m256i*)(av+NPAR)); // fetch a block of indexes
129
130
indexes1=_mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(indexes1),_mm256_castsi256_pd(_mm256_add_epi64(indexes1,wstride)),_mm256_castsi256_pd(indexes1))); // get indexes, add axis len if neg
130
-
ASSERT(_mm256_movemask_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexes1,_mm256_sub_epi64(indexes1,wstride))))==0xf,EVINDEX); // positive, and negative if you subtract axis length
131
+
ASSERT(_mm256_testc_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexes1,_mm256_sub_epi64(indexes1,wstride))),_mm256_castsi256_pd(ones)),EVINDEX); // positive, and negative if you subtract axis length
131
132
if(an>3*NPAR){
132
133
indexes2=_mm256_loadu_si256((__m256i*)(av+2*NPAR)); // fetch a block of indexes
133
134
indexes2=_mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(indexes2),_mm256_castsi256_pd(_mm256_add_epi64(indexes2,wstride)),_mm256_castsi256_pd(indexes2))); // get indexes, add axis len if neg
134
-
ASSERT(_mm256_movemask_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexes2,_mm256_sub_epi64(indexes2,wstride))))==0xf,EVINDEX); // positive, and negative if you subtract axis length
135
+
ASSERT(_mm256_testc_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexes2,_mm256_sub_epi64(indexes2,wstride))),_mm256_castsi256_pd(ones)),EVINDEX); // positive, and negative if you subtract axis length
135
136
}
136
137
}
137
138
}
138
139
if(an<=4*NPAR){
139
140
indexesn=_mm256_maskload_epi64(av+((an-1)&-NPAR),endmask); // fetch last block of indexes
140
141
indexesn=_mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(indexesn),_mm256_castsi256_pd(_mm256_add_epi64(indexesn,wstride)),_mm256_castsi256_pd(indexesn))); // get indexes, add axis len if neg
141
-
ASSERT(_mm256_movemask_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexesn,_mm256_sub_epi64(indexesn,wstride))))==0xf,EVINDEX); // positive, and negative if you subtract axis length
142
+
ASSERT(_mm256_testc_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexesn,_mm256_sub_epi64(indexesn,wstride))),_mm256_castsi256_pd(ones)),EVINDEX); // positive, and negative if you subtract axis length
indexesn=_mm256_loadu_si256((__m256i*)(av+3*NPAR)); // fetch last block of indexes
160
161
indexesn=_mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(indexesn),_mm256_castsi256_pd(_mm256_add_epi64(indexesn,wstride)),_mm256_castsi256_pd(indexesn))); // get indexes, add axis len if neg
161
-
ASSERT(_mm256_movemask_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexesn,_mm256_sub_epi64(indexesn,wstride))))==0xf,EVINDEX); // positive, and negative if you subtract axis length
162
+
ASSERT(_mm256_testc_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexesn,_mm256_sub_epi64(indexesn,wstride))),_mm256_castsi256_pd(ones)),EVINDEX); // positive, and negative if you subtract axis length
162
163
do{
163
164
// 17+indexes. We must read the tail repeatedly
164
165
// this first execution audits the indexes and converts negatives
__m256i indexesx=indexes; // fetch a block of indexes
175
176
indexes=_mm256_loadu_si256((__m256i*)avv); avv+=NPAR; // fetch a block of indexes
176
177
anynegindex=_mm256_or_si256(anynegindex,indexesx); indexesx=_mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(indexesx),_mm256_castsi256_pd(_mm256_add_epi64(indexesx,wstride)),_mm256_castsi256_pd(indexesx))); // get indexes, add axis len if neg
177
-
ASSERT(_mm256_movemask_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexesx,_mm256_sub_epi64(indexesx,wstride))))==0xf,EVINDEX); // positive, and negative if you subtract axis length
178
+
ASSERT(_mm256_testc_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexesx,_mm256_sub_epi64(indexesx,wstride))),_mm256_castsi256_pd(ones)),EVINDEX); // positive, and negative if you subtract axis length
anynegindex=_mm256_or_si256(anynegindex,indexes); indexes=_mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(indexes),_mm256_castsi256_pd(_mm256_add_epi64(indexes,wstride)),_mm256_castsi256_pd(indexes))); // get indexes, add axis len if neg
181
-
ASSERT(_mm256_movemask_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexes,_mm256_sub_epi64(indexes,wstride))))==0xf,EVINDEX); // positive, and negative if you subtract axis length
182
+
ASSERT(_mm256_testc_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexes,_mm256_sub_epi64(indexes,wstride))),_mm256_castsi256_pd(ones)),EVINDEX); // positive, and negative if you subtract axis length
indexes=_mm256_maskload_epi64(avv,endmask); // fetch a block of indexes
186
187
anynegindex=_mm256_or_si256(anynegindex,indexes); indexes=_mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(indexes),_mm256_castsi256_pd(_mm256_add_epi64(indexes,wstride)),_mm256_castsi256_pd(indexes))); // get indexes, add axis len if neg. unfetched indexes are 0
187
-
ASSERT(_mm256_movemask_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexes,_mm256_sub_epi64(indexes,wstride))))==0xf,EVINDEX); // positive, and negative if you subtract axis length
188
+
ASSERT(_mm256_testc_pd(_mm256_castsi256_pd(_mm256_andnot_si256(indexes,_mm256_sub_epi64(indexes,wstride))),_mm256_castsi256_pd(ones)),EVINDEX); // positive, and negative if you subtract axis length
188
189
_mm256_maskstore_epi64(x, endmask, _mm256_mask_i64gather_epi64(_mm256_setzero_si256(),v,indexes,endmask,SZI)); x+=((an-1)&(NPAR-1))+1; // must use a different reg for source and index, lest VS2013 create an illegal instruction
0 commit comments