|
| 1 | +// Common routines for i.-family verbs |
| 2 | + |
| 3 | +#include "vcomp.h" |
| 4 | + |
| 5 | +/* |
| 6 | + mode one of the following: |
| 7 | + 0 IIDOT i. a w rank |
| 8 | + 1 IICO i: a w rank |
| 9 | + 2 INUBSV ~: w rank |
| 10 | + 3 INUB ~. w |
| 11 | + 4 ILESS -. a w |
| 12 | + 5 INUBI I.@~: w |
| 13 | + 6 IEPS e. a w rank |
| 14 | + 7 II0EPS e.i.0: a w |
| 15 | + 8 II1EPS e.i.1: a w |
| 16 | + 9 IJ0EPS e.i:0: a w |
| 17 | + 10 IJ1EPS e.i:1: a w |
| 18 | + 11 ISUMEPS [:+ /e. a w |
| 19 | + 12 IANYEPS [:+./e. a w |
| 20 | + 13 IALLEPS [:*./e. a w |
| 21 | + 14 IIFBEPS I.@e. a w |
| 22 | + 30 IPHIDOT i. a w prehashed |
| 23 | + 31 IPHICO i: a w prehashed |
| 24 | + 34 IPHLESS -. a w prehashed no longer supported |
| 25 | + 36 IPHEPS e. a w prehashed |
| 26 | + 37 IPHI0EPS e.i.0: a w prehashed |
| 27 | + 38 IPHI1EPS e.i.1: a w prehashed |
| 28 | + 39 IPHJ0EPS e.i:0: a w prehashed |
| 29 | + 40 IPHJ1EPS e.i:1: a w prehashed |
| 30 | + 41 IPHSUMEPS [:+ /e. a w prehashed |
| 31 | + 42 IPHANYEPS [:+./e. a w prehashed |
| 32 | + 43 IPHALLEPS [:*./e. a w prehashed |
| 33 | + 44 IPHIFBEPS I.@e. a w prehashed |
| 34 | + asct target axis length (number of things to be searched from in a single pass) |
| 35 | + n target item # atoms |
| 36 | + wsct # target items in a left-arg cell, which may include multiple right-arg cells (number of searches) |
| 37 | + k target item # bytes |
| 38 | + ac # left arg cells (cells, NOT items) |
| 39 | + wc # right arg cells |
| 40 | + ak # bytes left arg cells, or 0 if only 1 cell scaf unnecessary? not used in smallrange |
| 41 | + wk # bytes right arg cells, or 0 if only one cell |
| 42 | + a left arg |
| 43 | + w right arg, or mark for m&i. or m&i: or e.&n or -.&n |
| 44 | + h pointer to hash table or to 0 |
| 45 | + z result |
| 46 | +*/ |
| 47 | + |
| 48 | +#define IOF(f) A f(J jt,I mode,I n,I asct,I wsct,I ac,I wc,A a,A w,A z,I k,I ak,I wk,A h) // asct/wsct go by m and c |
| 49 | +//J jt,I mode,I n,I asct,I wsct,I ac,I wc,A a,A w,A z |
| 50 | +// variables used in IOF routines: |
| 51 | +// h=A for hashtable, hv->hashtable data, p=#entries in table, pm=unsigned p, used for converting hash to bucket# |
| 52 | +// acn,wcn=#atoms in cell of a,w |
| 53 | +// cn = #atoms in target item |
| 54 | +// j is the starting bucket number of the hashtable search |
| 55 | +// hj is the index of the first empty-or-matching bucket encountered |
| 56 | +// zv->result area (as an array of pointers), av->a data, wv->w data |
| 57 | + |
| 58 | + |
| 59 | + |
| 60 | +// get hash slot and set up for the search. j holds the slot |
| 61 | +#define HASHSLOT(hash) j=((hash)*p)>>32; |
| 62 | + |
| 63 | +// Misc code to set the shape once we see how many results there are, used for ~. y and x -. y |
| 64 | +#define ZISHAPE AS(z)[0]=AN(z)=zi-zv |
| 65 | +#define ZCSHAPE AS(z)[0]=(zc-(C*)zv)/k; AN(z)=n*AS(z)[0] |
| 66 | +#define ZUSHAPE(T) AS(z)[0]= zu-(T*)zv; AN(z)=n*AS(z)[0] |
| 67 | + |
| 68 | +// Calculate the hash slot. The hash calculation (input parm) relies on the name v and produces the name j. We have moved v to an xmm register to reduce register pressure |
| 69 | +// here, so extract its parts for use as needed |
| 70 | +#define HASHSLOTP(T,hash) v=(T*)_mm_extract_epi64(vp,0); j=((hash)*(UIL)(p))>>32; |
| 71 | +// Conditionally insert a new value into the hash table. The initial value of hj (the table scan pointer) has been fetched. name is the name holding the slot to be added |
| 72 | +// (it will be j, j1, or j2 depending on where we are in the processing pipeline), |
| 73 | +// exp is a comparison expression meaning 'mismatch' that returns 0 if the data indexed by the slot is equal to *v (the expression will use *v as the new value, and hj as the index into the |
| 74 | +// original input av to point to the value represented in the hashtable). hj advances through the hash until an empty slot or a match is found. |
| 75 | +// The v value is stored in an xmm register and brought out into v for use by (exp); it is a delayed version of the v that was used for HASHSLOT. |
| 76 | +// The table search goes in descending order, and wraps around to p-1 (the last entry in the table). |
| 77 | +// What happens after the search depends on the last 3 parameters: |
| 78 | +// If (store) is 1, the value of i (which is the loop index giving the position within a of the item being processed) is stored into the empty hash slot, |
| 79 | +// only if the hash search does not find a match. If (store) is 2, the entry that we found is cleared, by setting it to maxcount+1, when we find a match. |
| 80 | +// When (store)=2, we also ignore hash entries containing maxcount+1, treating them as failed compares |
| 81 | +// Independent of (store), (fstmt) is executed if the item is found in the hash table, and (nfstmt) is executed if it is not found. |
| 82 | +#define FINDP(T,TH,hsrc,name,exp,fstmt,nfstmt,store) NOUNROLL do{if(hj==hsrc##sct){ \ |
| 83 | + if(store==1)hv[name]=(TH)i; nfstmt break;} /* this is the not-found case */ \ |
| 84 | + if((store!=2||hj<hsrc##sct)&&(v=(T*)_mm_extract_epi64(vp,1),!(exp))){if(store==2)hv[name]=(TH)(hsrc##sct+1); fstmt break;} /* found */ \ |
| 85 | + if(unlikely(--name<0))name+=p; hj=hv[name]; /* miscompare, nust continue search */ \ |
| 86 | + }while(1); |
| 87 | + |
| 88 | +// Traverse the hash table for one argument. (src) indicates which argument, a or w, we are looping through; (hsrc) indicates which argument provided the hash table. |
| 89 | +// For each item we do HASHSLOT folowed by FINDP, and adjust the (v) values (both stored in xmm variable vp) to keep track |
| 90 | +// of which input item is being operated on. This loop is triple-unrolled, so that after we HASHSLOT for entry q, we FINDP for entry q-2. As soon as we HASHSLOT for entry |
| 91 | +// q, we prefetch it into L1 cache; then as soon as we have finished the last store for entry q-1, we do the fetch for entry q (which will be fetching while the hash for entry |
| 92 | +// q+2 is being calculated). |
| 93 | +// The (fstmt,nfstmt,store) arguments indicate what to do when a match/notmatch is resolved. |
| 94 | +// (loopctl) give the stride through the input array, the control for the main loop, and the index of the last value. These values differ for forward and reverse scans through the input. |
| 95 | +#define XSEARCH(T,TH,src,hsrc,hash,exp,stride,fstmt,nfstmt,store,vpofst,loopctl,finali) \ |
| 96 | + {I i, j, hj; T *v; vp=_mm_insert_epi64(vp,(I)(src##v+vpofst),0); vpstride = _mm_insert_epi64(vp,(stride)*(I)sizeof(T),0); vp=_mm_shuffle_epi32(vp,0x44); vpstride=_mm_insert_epi64(vpstride,0LL,1); \ |
| 97 | + HASHSLOTP(T,hash) if(src##sct>1){I j1,j2; vp=_mm_add_epi64(vp,vpstride); j1=j; HASHSLOTP(T,hash) hj=hv[j1]; vp=_mm_add_epi64(vp,vpstride); vpstride=_mm_shuffle_epi32(vpstride,0x44); \ |
| 98 | + for loopctl {j2=j1; j1=j; HASHSLOTP(T,hash) PREFETCH((C*)&hv[j]); FINDP(T,TH,hsrc,j2,exp,fstmt,nfstmt,store); vp=_mm_add_epi64(vp,vpstride); hj=hv[j1];} \ |
| 99 | + FINDP(T,TH,hsrc,j1,exp,fstmt,nfstmt,store); vp=_mm_add_epi64(vp,vpstride);} hj=hv[j]; i=finali; FINDP(T,TH,hsrc,j,exp,fstmt,nfstmt,store); } |
| 100 | + |
| 101 | +// Traverse a in forward direction, adding values to the hash table |
| 102 | +#define XDOAP(T,TH,hash,exp,stride) XSEARCH(T,TH,a,a,hash,exp,stride,{},{},1,0, (i=0;i<asct-2;++i) ,asct-1) |
| 103 | +// Traverse w in forward direction, executing fstmt/nfstmt depending on found/notfound; and adding to the hash if (reflex) is 1, indicating a reflexive operation |
| 104 | +#define XDOP(T,TH,hash,exp,stride,fstmt,nfstmt,reflex) XSEARCH(T,TH,w,a,hash,exp,stride,fstmt,nfstmt,reflex,0, (i=0;i<wsct-2;++i) ,wsct-1) |
| 105 | +// same for traversing a/w in reverse |
| 106 | +#define XDQAP(T,TH,hash,exp,stride) XSEARCH(T,TH,a,a,hash,exp,(-(stride)),{},{},1,cn*(asct-1), (i=asct-1;i>1;--i) ,0) |
| 107 | +#define XDQP(T,TH,hash,exp,stride,fstmt,nfstmt,reflex) XSEARCH(T,TH,w,a,hash,exp,(-(stride)),fstmt,nfstmt,reflex,cn*(wsct-1), (i=wsct-1;i>1;--i) ,0) |
| 108 | + |
| 109 | +// b is a bit index; BYTENO is byte number, BITNO is bit within byte |
| 110 | +#define BYTENO(b) ((b)>>3) |
| 111 | +#define BITNO(b) ((b)&7) |
| 112 | + |
| 113 | +I hashallo(IH *RESTRICT,UI,UI,I); |
| 114 | +UI cthia(UIL,D,A); |
| 115 | +UI jthiau(J,A); |
| 116 | +B jteqnx(J,I,X*,X*),jteqnq(J,I,Q*,Q*),jteqa(J,I,A*,A*); |
| 117 | + |
| 118 | +// Hash a single INT-sized atom |
| 119 | +#define hici1(x) CRC32L(-1LL,*(x)) |
| 120 | + |
| 121 | +// obsolete #define RETCRC3 R CRC32L(crc0,CRC32L(crc1,crc2)) |
| 122 | +#define RETCRC3 R CRC32L(crc0,(UI)rol32(crc1,9)+((UI)rol32(crc2,21)<<32)) |
| 123 | + |
| 124 | +// Hash an INT list |
| 125 | +static __forceinline UI hici(I k, UI* v){ |
| 126 | + // Owing to latency, hash 3 inputs at a time; but not if short. Length is never 0 but can be 1. |
| 127 | + if((k-=3)<=0){ // fast path for len<=3. We think all these branches will predict correctly |
| 128 | + UI crc; crc=CRC32L(-1LL,v[0]); if(k>=-1){crc=CRC32L(crc,v[1]); if(k==0)crc=CRC32L(crc,v[2]);} R crc; |
| 129 | + } |
| 130 | + UI crc0=-1, crc1=crc0, crc2=crc0; |
| 131 | + NOUNROLL do{ |
| 132 | + crc0=CRC32L(crc0,v[0]); crc1=CRC32L(crc1,v[1]); crc2=CRC32L(crc2,v[2]); |
| 133 | + v+=3, k-=3; |
| 134 | + }while(k>=0); // at end k is negative, and we have gone through the loop origk%3 times |
| 135 | + if(k>-2){crc1=CRC32L(crc1,v[1]); crc0=CRC32L(crc0,v[0]);}else if(k==-2){crc0=CRC32L(crc0,v[0]);} |
| 136 | + RETCRC3; |
| 137 | +} |
| 138 | + |
| 139 | +// Hash a FL list, with check for -0 and +0 |
| 140 | +static UI hic0(I k, UIL* v){ |
| 141 | + // Owing to latency, hash pairs of inputs. Check each for -0 |
| 142 | + UI crc0=-1, crc1=crc0; |
| 143 | + for(k-=2;k>=0;v+=2, k-=2){ |
| 144 | + if(likely(v[0]!=(UIL)NEGATIVE0)){crc0=CRC32LL(crc0,v[0]);}else{crc0=CRC32LL(crc0,0);} |
| 145 | + if(likely(v[1]!=(UIL)NEGATIVE0)){crc1=CRC32LL(crc1,v[1]);}else{crc1=CRC32LL(crc1,0);} |
| 146 | + } |
| 147 | + if(k>-2){if(likely(v[0]!=(UIL)NEGATIVE0)){crc0=CRC32LL(crc0,v[0]);}else{crc0=CRC32LL(crc0,0);}} |
| 148 | + R CRC32L(crc0,crc1); |
| 149 | +} |
| 150 | + |
| 151 | +// Hash a pair of INTs. Not compatible with hici. Based on wyhash. |
| 152 | +static __forceinline UI hici2(UI x, UI y){ |
| 153 | + x^=0xe7037ed1a0b428db; |
| 154 | + y^=0xa0761d6478bd642f; |
| 155 | + UI l,h;DPUMULU(x,y,l,h); |
| 156 | + R (UI4)(l^h);} |
| 157 | + |
| 158 | +// Hash a single FL atom, with check for -0 and +0 |
| 159 | +#define hic01(x) ((*(UIL*)(x)!=(UIL)NEGATIVE0)?CRC32LL(-1L,*(UIL*)(x)):CRC32LL(-1L,0)) |
| 160 | + |
| 161 | +// Hashes for extended/rational types. Hash only the numerator of rationals. These are |
| 162 | +// Q and X types (Q is a pair of X types) |
| 163 | +static UI hix(X*v){A y=*v; R hici(XLIMBLEN(y),UIAV(y));} |
| 164 | +static UI hiq(Q*v){A y=v->n; R hici(XLIMBLEN(y),UIAV(y));} |
| 165 | + |
| 166 | +// Hash a single double, using only the bits in ctmask. |
| 167 | +// not required for tolerant comparison, but if we tried to do tolerant comparison through the fast code it would help |
| 168 | +static UI cthid(UIL ctmask,D d){R likely(*(UIL*)&d!=(UIL)NEGATIVE0)?CRC32LL(-1L,*(UIL*)&d&ctmask):CRC32LL(-1L,0);} |
| 169 | + |
| 170 | +// compare floats, not distinguishing -0 from +0. Return 0 if equal, 1 if not equal |
| 171 | +static __forceinline I fcmp0(D* a, D* w, I n){ |
| 172 | + DQ(n, if(a[i]!=w[i])R 1;); |
| 173 | + R 0; |
| 174 | +} |
| 175 | + |
| 176 | + |
| 177 | +#if C_AVX512 |
| 178 | +#define COMPSETUP \ |
| 179 | + __mmask8 endmask = BZHI(0xff,1+(n-1)%8); |
| 180 | +#define COMPCALL(a) icmpeq(v,(a)+n*hj,n,endmask) |
| 181 | +static __forceinline I icmpeq(I *x, I *y, I n, __mmask8 em) { |
| 182 | + if(common(n<=8))R !!_mm512_cmpneq_epi64_mask(_mm512_maskz_loadu_epi64(em,x),_mm512_maskz_loadu_epi64(em,y)); |
| 183 | + __m512i u,v; |
| 184 | + DQ((n-1)/8, |
| 185 | + u=_mm512_loadu_si512(x); |
| 186 | + v=_mm512_loadu_si512(y); |
| 187 | + x+=8;y+=8; |
| 188 | + if(_mm512_cmpneq_epi64_mask(u,v))R 1;) |
| 189 | + R !!_mm512_cmpneq_epi64_mask(_mm512_maskz_loadu_epi64(em,x),_mm512_maskz_loadu_epi64(em,y));} |
| 190 | +#elif C_AVX2 || EMU_AVX2 |
| 191 | +#define COMPSETUP \ |
| 192 | + __m256i endmask = _mm256_loadu_si256((__m256i*)(validitymask+((-n)&(NPAR-1)))); // mask for 0 1 2 3 4 5 is xxxx 0001 0011 0111 1111 0001 |
| 193 | +#define COMPCALL(a) icmpeq(v,(a)+n*hj,n,endmask) |
| 194 | +static __forceinline I icmpeq(I *x, I *y, I n, __m256i endmask) { |
| 195 | + __m256i u,v; __m256d ones=_mm256_castsi256_pd(_mm256_cmpeq_epi64(endmask,endmask)); |
| 196 | + I i=(n-1)>>LGNPAR; /* # loops for 0 1 2 3 4 5 is x 0 0 0 0 1 */ |
| 197 | + while(--i>=0){ |
| 198 | + u=_mm256_loadu_si256((__m256i*)x); v=_mm256_loadu_si256((__m256i*)y); x+=NPAR;y+=NPAR; if(!_mm256_testc_pd(_mm256_castsi256_pd(_mm256_cmpeq_epi64(u,v)),ones))R 1; // abort on any nonequal |
| 199 | + } |
| 200 | + u=_mm256_maskload_epi64(x,endmask); v=_mm256_maskload_epi64(y,endmask); |
| 201 | + R !_mm256_testc_pd(_mm256_castsi256_pd(_mm256_cmpeq_epi64(u,v)),ones); // no miscompares, compare equal (= 0 result) |
| 202 | +} |
| 203 | +#else |
| 204 | +#define COMPSETUP |
| 205 | +#define COMPCALL(a) icmpeq(v,(a)+n*hj,n) |
| 206 | +// Return nonzero if *a!=*w |
| 207 | +static __forceinline I icmpeq(I *a, I *w, I n) { |
| 208 | + if(n)do{ |
| 209 | + if(*a!=*w)R n; |
| 210 | + if(--n)++a,++w; else R n; |
| 211 | + }while(1); |
| 212 | + R n; |
| 213 | +} |
| 214 | +#endif |
| 215 | + |
| 216 | +// I flip-flopped for a while between this and a pair of scalar compares. Decided to go with this: lower reg pressure, better predictability |
| 217 | +#if C_AVX512 |
| 218 | +#define cmpi2(x,y) ({ !!_mm_cmpneq_epi64_mask(_mm_loadu_si128((__m128i*)(x)),_mm_loadu_si128((__m128i*)(y))); }) |
| 219 | +#else |
| 220 | +#define cmpi2(x,y) ({ __m128i _cmpres=_mm_cmpeq_epi32(_mm_loadu_si128((__m128i*)(x)),_mm_loadu_si128((__m128i*)(y))); !_mm_testc_si128(_cmpres,_mm_setone_si128()); }) |
| 221 | +#endif |
| 222 | + |
| 223 | +static inline B jeqd(I n,D*u,D*v,D cct){DQ(n, if(!TCMPEQ(cct,*u,*v))R 0; ++u; ++v;); R 1;} |
| 224 | +static inline B jteqz(J jt,I n,Z*u,Z*v){DQ(n, if(!zeq(*u,*v))R 0; ++u; ++v;); R 1;} |
| 225 | + |
| 226 | + |
| 227 | +// create a mask of bits in which a difference is considered significant for floating-point purposes. |
| 228 | +// cct is complementary comparison tolerance; return |
| 229 | +static UIL calcctmask(D cct){ |
| 230 | + // New version: we have to find the max maskvalue such that x+tolerance and x-tolerance are not separated by more than one |
| 231 | + // maskpoint, for any x. The worst-case x occurs where the mantissa of x is as big as it can be, e. g. 1.1111111111... |
| 232 | + // At that point the tolerance band above is t/(t+1) times the approx value (2 in the example). The tolerance band below is about the same, |
| 233 | + // and the sum of the two must not exceed the mask size. We calculate the mask in floating point as 2 - 2 * 2*(t/(t+1)) which will |
| 234 | + // give its floating-point representation. We then adjust the mask by forcing the exponent to be masked, and clearing any bits below the |
| 235 | + // highest clear bit |
| 236 | + if(likely(cct==1.0-FUZZ))R (UIL)0xfffffffffffffc00LL; // default ct |
| 237 | + if(likely(cct==1.0))R ~0LL; // intolerant ct |
| 238 | + // user specified ct. Calculate the mask for it |
| 239 | + D p=2.0 - (4.0*(1.0-cct))/(2.0-cct); |
| 240 | + UIL q=(~*(UIL*)&p)<<16; // shift exponent away & complement |
| 241 | + I zeropos; CTLZI(q,zeropos); // get bit# of highest 0 bit (+ 16, because of shift) - 15 if all bits were 1 |
| 242 | + R ((UIL)~0LL)<<(zeropos-15); |
| 243 | +} |
| 244 | + |
| 245 | + |
| 246 | +// prototypes for implementations in viavx?.c |
| 247 | +IOF(jtioax1); IOF(jtioau); IOF(jtiox); IOF(jtioq); IOF(jtioc); IOF(jtioi); IOF(jtioC2); IOF(jtioC4); IOF(jtioi1); IOF(jtio16); IOF(jtioc01); IOF(jtioz01); IOF(jtioc0); IOF(jtioz0); |
| 248 | +IOF(jtioax12); IOF(jtioau2); IOF(jtiox2); IOF(jtioq2); IOF(jtioc2); IOF(jtioi2); IOF(jtioC22); IOF(jtioC42); IOF(jtioi12); IOF(jtio162); IOF(jtioc012); IOF(jtioz012); IOF(jtioc02); IOF(jtioz02); |
| 249 | +IOF(jtioz); IOF(jtioz1); IOF(jtiod); IOF(jtiod1); IOF(jtioa); IOF(jtioa1); |
| 250 | +IOF(jtioz2); IOF(jtioz12); IOF(jtiod2); IOF(jtiod12); IOF(jtioa2); IOF(jtioa12); |
| 251 | +IOF(jtio12); IOF(jtio22); IOF(jtio42); IOF(jtio82); |
| 252 | +IOF(jtio14); IOF(jtio24); IOF(jtio44); IOF(jtio84); |
| 253 | +A jtiosc(J jt,I mode,I n,I asct,I wsct,I ac,I wc,A a,A w,A z); |
| 254 | +IOF(jtiobs); |
| 255 | +IOF(jtiowax1); IOF(jtiowau); IOF(jtiowx); IOF(jtiowq); IOF(jtiowc); IOF(jtiowi); IOF(jtiow21); IOF(jtiow41); IOF(jtiowi1); IOF(jtiow161); IOF(jtiowc01); IOF(jtiowz01); IOF(jtiowc0); IOF(jtiowz0); |
| 256 | +IOF(jtiowax12); IOF(jtiowau2); IOF(jtiowx2); IOF(jtiowq2); IOF(jtiowc2); IOF(jtiowi2); IOF(jtiow212); IOF(jtiow412); IOF(jtiowi12); IOF(jtiow1612); IOF(jtiowc012); IOF(jtiowz012); IOF(jtiowc02); IOF(jtiowz02); |
0 commit comments