You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
// encode l as 00 r as 11 m as 01/10, the 8 possibilities are mr-ll-rm (LE) -> 01 11 10 00 00 01 11 10 -> 0 1 1 0 0 0 1 1 0 (LE) -> 011000110 0xc6
50
51
Ipivotx=((pivotcomp&1?rr:ll)+(pivotcomp&2?rr:ll))>>1; pivot=v[pivotx]; v[pivotx]=v[r]; // pick the median pivot, swap it (notionally) with the last
51
52
}
52
53
#ifSORTQCOND
@@ -63,7 +64,7 @@ SORTQSCOPE void SORTQNAME(SORTQTYPE *v, I n){
63
64
SORTQTYPE*v0=v+l; // base of the partitioned region
64
65
UIcstk=0; // will hold comparison results. Init to 0 for MSBs so as not to interfere with finding highest 1
65
66
#ifSORTQCOND
66
-
// go back to front, shifting bits up from the bottom
67
+
// go back to front, shifting bits up from the bottom. A 1 bit means the value should go to side 0
67
68
{
68
69
__m256iendmask;
69
70
endmask=_mm256_loadu_si256((__m256i*)(validitymask+((-(r-l))&(NPAR-1)))); /* mask for 00=1111, 01=1000, 10=1100, 11=1110 */
@@ -83,17 +84,17 @@ SORTQSCOPE void SORTQNAME(SORTQTYPE *v, I n){
83
84
// the upper partition and we will enter a worst-case where we partition only one item each pass. To prevent that, we will repartition using the same pivot, but
84
85
// this time moving the equal values to the lower partition. If this produces a better partition, we will use it. If not, we must be processing a block of
85
86
// ALL equal values, and we stop partitioning. We thus process a block with many equals as follows: scan & partition to strip elements lower than the equals; scan, rescan, & partition to
86
-
// strip elements higher than the equals; scan & rescan to detect the all-equals case & stop partitioning
87
+
// strip elements higher than the equals; scan & rescan to detect the all-equals case & stop partitioning.  scaf* replace this with the pivot-partitioning below
87
88
DQ(r-l, cstk=2*cstk+(v0[i]<=pivot);) // this time equality moves to the lower side (cstk is already 0 to start)
88
89
// There MUST have been at least 1 equal value, because the pivot was the median of three and yet nothing compared low the first time; one value at least must compare equal.
89
90
if(!(cstk&(cstk+1))) {
90
91
// There are still no exchanges. Find the partition sizes.
91
92
UI4xchgx04=CTLZI(cstk); xchgx0=xchgx04; xchgx1=xchgx0+1; // low partition always ends right below the high - no middle partition
92
93
if(xchgx1!=r-l)goto finmedxchg; // there are no exchanges, but we have to process both partitions (one of which might be empty)
93
-
r=l; continue; // all equal -- abort this partition and go to the next one
94
+
goto batchfinished; // all equal -- abort this partition and go to the next one
94
95
}
95
96
// ...falling through if there are exchanges after a rescan
96
-
}else{
97
+
}else{
97
98
// The lower partition is not empty. Set the partition pointer above the lowest 1-bit
98
99
UI4xchgx04=CTLZI(cstk); xchgx0=xchgx04; xchgx1=xchgx0+1; // low partition always ends right below the high - no middle partition
99
100
goto finmedxchg;
@@ -159,7 +160,7 @@ SORTQSCOPE void SORTQNAME(SORTQTYPE *v, I n){
159
160
Ibittofill=BW-ncmp1; // get the running index of where we will put new bits
160
161
SORTQTYPE*vv=v+in1-BW-(NPAR-1); // pointer to beginning of the 4-word section ending at in1
161
162
// fill 4-bit sections up to the last, which will be 1-4 bits
162
-
// if AVX2, use permute4x rather than the 2 insts here
163
+
// scaf* if AVX2, use permute4x rather than the 2 insts here
// if the partitioning is very bad, it is probably because the partition is almost all the same value, which is the pivot. Since pivots can go to either side, one side may move much faster - 64x faster - than the other,
204
+
// leaving an imbalance. To ameliorate the problem, we go through the longer side in this case, swapping the pivots to the middle of the partition
205
+
if(unlikely((MAX(lenl,lenr)>>3)>MIN(lenl,lenr))){ // check is one side 8x larger than the other. We will abort the copy if the long side is not >50% pivots
206
+
if(lenl>lenr){
207
+
// left is much larger than right. Move values that equal the pivot to the center partition, where they will stay undisturbed. Make sure the partition has a lot of pivot values
208
+
DQ(lenl+1, xchgx0=i; if(v[l+i]!=pivot)break;) xchgx0+=l; // first discard trailing pivot values, leaving xchgx0 at a non-pivot
209
+
DQ(xchgx0-l+1, Irdx=l+i; rdx=v[rdx]==pivot?xchgx0:rdx; xchgx0-=v[l+i]==pivot; v[l+i]=v[rdx]; if(xchgx0-(l+i)>(xchgx1+10)-xchgx0)break;) // if [i]=pivot, move[xchgx0] down, else rewrite in place; if # not moved exceeds 10+#moved, abort, it's not lopsided enough
210
+
// now xchgx0 is the index of the end of the left partition (could be l-1)
211
+
DQ((xchgx1-1)-(xchgx0+1), v[xchgx0+1+i]=pivot;) // install pivots from xchgx0+1 to before xchgx1-1. This creates a middle partition from the pivots
212
+
lenl=xchgx0-l; // shorten the left partition to exclude the middles
213
+
}else{
214
+
// similarly, when the right partition is the larger
215
+
Inonpivx; DO(lenr+1, nonpivx=i; if(v[xchgx1+i]!=pivot)break;) xchgx1=nonpivx+=xchgx1; // first discard leading pivot values, leaving nonpivx and xchgx1 at a non-pivot. xchgx1 will always point to the first unswapped non-pivot
216
+
DO(r-nonpivx+1, Irdx=nonpivx+i; rdx=v[rdx]==pivot?xchgx1:rdx; xchgx1+=v[nonpivx+i]==pivot; v[nonpivx+i]=v[rdx]; if((nonpivx+i)-xchgx1>(xchgx1+10)-xchgx0)break;) // if [i]=pivot, move[xchgx1] up, else rewrite in place; if # not moved exceeds 10+#moved, abort, it's not lopsided enough
217
+
// now xchgx1 is the index of the start of the shortened right partition (everything from nonpivx up to xchgx1-1 will be overwritten with pivots)
218
+
DO(xchgx1-nonpivx, v[nonpivx+i]=pivot;) // install pivots from xchgx0+1 to before xchgx1-1. This creates a middle partition from the pivots
219
+
lenr=r-xchgx1; // shorten the right partition to exclude the middles
220
+
}
221
+
}
222
+
223
+
// push stack for the larger partition; modify batch pointers for the smaller
203
224
// make l,r the smaller partition and l0,r0 the larger; then stack l0,r0
0 commit comments