Skip to content

Commit faac016

Browse files
committed
maladroit partitioning when many repeated values
1 parent f0cf512 commit faac016

1 file changed

Lines changed: 32 additions & 11 deletions

File tree

jsrc/vgsortq.h

Lines changed: 32 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ SORTQSCOPE void SORTQNAME(SORTQTYPE *v, I n){
3535
case -1: case 0:;
3636
}
3737
// that batch is sorted; move to the next
38+
batchfinished: ; // come here if the entire partition is known to be in order
3839
if(--stackp<0)R; // back up stack; if we're finishing the last call, we're through
3940
l=stack[stackp][0]; r=stack[stackp][1]; // resume the next
4041
}
@@ -44,9 +45,9 @@ SORTQSCOPE void SORTQNAME(SORTQTYPE *v, I n){
4445
{
4546
// actually we advance the first/last to quartile positions, to avoid a worst-case when an almost-sorted list has some very small numbers
4647
// added to the end. In that case a small value is chosen as the pivot and replaced at the end with another small value
47-
I ll=l+((r-l)>>2), rr=r-((r-l)>>2);
48-
I pivotcomp = 0xc6>>(4*(v[ll]>v[(ll+rr)>>1]) + 2*(v[ll]>v[rr]) + (v[(ll+rr)>>1]>v[rr])); // l>m, l>r, m>r 000 lmr 001 lrm 010 - 011 rlm 100 mlr 101 - 110 mrl 111 rml
49-
// encode l as 00 r as 11 m as 01/10, the sequence is mr-ll-rm 01 11 10 00 00 01 11 10 0 1 1 0 0 0 1 1 0 (LE) -> 011000110 0xc6
48+
I ll=l+((r-l)>>2), rr=r-((r-l)>>2), mm=(rr+ll)>>1; // we don't worry about overflow - we need to match the mm computation below
49+
I pivotcomp = 0xc6>>(4*(v[ll]>v[mm]) + 2*(v[ll]>v[rr]) + (v[mm]>v[rr])); // l>m, l>r, m>r 000 lmr 001 lrm 010 - 011 rlm 100 mlr 101 - 110 mrl 111 rml
50+
// encode l as 00 r as 11 m as 01/10, the 8 possibilities are mr-ll-rm (LE) -> 01 11 10 00 00 01 11 10 -> 0 1 1 0 0 0 1 1 0 (LE) -> 011000110 0xc6
5051
I pivotx=((pivotcomp&1?rr:ll)+(pivotcomp&2?rr:ll))>>1; pivot=v[pivotx]; v[pivotx]=v[r]; // pick the median pivot, swap it (notionally) with the last
5152
}
5253
#if SORTQCOND
@@ -63,7 +64,7 @@ SORTQSCOPE void SORTQNAME(SORTQTYPE *v, I n){
6364
SORTQTYPE *v0=v+l; // base of the partitioned region
6465
UI cstk=0; // will hold comparison results. Init to 0 for MSBs so as not to interfere with finding highest 1
6566
#if SORTQCOND
66-
// go back to front, shifting bits up from the bottom
67+
// go back to front, shifting bits up from the bottom. A 1 bit means the value should go to side 0
6768
{
6869
__m256i endmask;
6970
endmask = _mm256_loadu_si256((__m256i*)(validitymask+((-(r-l))&(NPAR-1)))); /* mask for 00=1111, 01=1000, 10=1100, 11=1110 */
@@ -83,17 +84,17 @@ SORTQSCOPE void SORTQNAME(SORTQTYPE *v, I n){
8384
// the upper partition and we will enter a worst-case where we partition only one item each pass. To prevent that, we will repartition using the same pivot, but
8485
// this time moving the equal values to the lower partition. If this produces a better partition, we will use it. If not, we must be processing a block of
8586
// ALL equal values, and we stop partitioning. We thus process a block with many equals as follows: scan & partition to strip elements lower than the equals; scan, rescan, & partition to
86-
// strip elements higher than the equals; scan & rescan to detect the all-equals case & stop partitioning
87+
// strip elements higher than the equals; scan & rescan to detect the all-equals case & stop partitioning. scaf* replace this with the pivot-partitioning below
8788
DQ(r-l, cstk=2*cstk+(v0[i]<=pivot);) // this time equality moves to the lower side (cstk is already 0 to start)
8889
// There MUST have been at least 1 equal value, because the pivot was the median of three and yet nothing compared low the first time; one value at least must compare equal.
8990
if(!(cstk&(cstk+1))) {
9091
// There are still no exchanges. Find the partition sizes.
9192
UI4 xchgx04=CTLZI(cstk); xchgx0=xchgx04; xchgx1=xchgx0+1; // low partition always ends right below the high - no middle partition
9293
if(xchgx1!=r-l)goto finmedxchg; // there are no exchanges, but we have to process both partitions (one of which might be empty)
93-
r=l; continue; // all equal -- abort this partition and go to the next one
94+
goto batchfinished; // all equal -- abort this partition and go to the next one
9495
}
9596
// ...falling through if there are exchanges after a rescan
96-
} else {
97+
}else{
9798
// The lower partition is not empty. Set the partition pointer above the lowest 1-bit
9899
UI4 xchgx04=CTLZI(cstk); xchgx0=xchgx04; xchgx1=xchgx0+1; // low partition always ends right below the high - no middle partition
99100
goto finmedxchg;
@@ -159,7 +160,7 @@ SORTQSCOPE void SORTQNAME(SORTQTYPE *v, I n){
159160
I bittofill=BW-ncmp1; // get the running index of where we will put new bits
160161
SORTQTYPE *vv=v+in1-BW-(NPAR-1); // pointer to beginning of the 4-word section ending at in1
161162
// fill 4-bit sections up to the last, which will be 1-4 bits
162-
// if AVX2, use permute4x rather than the 2 insts here
163+
// scaf* if AVX2, use permute4x rather than the 2 insts here
163164
while(bittofill<BW-NPAR){cstk1=cstk1+((I)_mm256_movemask_pd(_mm256_permute_pd(_mm256_permute2f128_pd(SORTQCASTTOPD(SORTQCMP256(pivot256,SORTQULOAD((SORTQULOADTYPE)vv) SORTQCMPTYPE)),SORTQCASTTOPD(pivot256),0x01),0x5))<<bittofill); vv-=NPAR; bittofill+=NPAR;}
164165
cstk1=cstk1+((I)_mm256_movemask_pd(_mm256_permute_pd(_mm256_permute2f128_pd(SORTQCASTTOPD(SORTQCMP256(pivot256,SORTQMASKLOAD(vv,endmask) SORTQCMPTYPE)),SORTQCASTTOPD(pivot256),0x01),0x5))<<bittofill);
165166

@@ -195,11 +196,31 @@ SORTQSCOPE void SORTQNAME(SORTQTYPE *v, I n){
195196

196197
// exchange the end of the array with the start of the right side
197198
v[r]=v[xchgx1]; v[xchgx1]=pivot;
198-
199-
// push stack for the larger partition; modify batch pointers for the smaller
200199
// recursions are l..xchgx0 and xchgx1+1..r the +1 to step over the pivot from this pass
201200
++xchgx1; // the upper partition starts AFTER the pivot
202-
I lenl=xchgx0-l, lenr=r-xchgx1, l0=l, r0=r;
201+
I lenl=xchgx0-l, lenr=r-xchgx1, l0=l, r0=r; // actually, len-1
202+
203+
// if the partitioning is very bad, it is probably because the partition is almost all the same value, which is the pivot. Since pivots can go to either side, one side may move much faster - 64x faster - than the other,
204+
// leaving an imbalance. To ameliorate the problem, we go through the longer side in this case, swapping the pivots to the middle of the partition
205+
if(unlikely((MAX(lenl,lenr)>>3)>MIN(lenl,lenr))){ // check is one side 8x larger than the other. We will abort the copy if the long side is not >50% pivots
206+
if(lenl>lenr){
207+
// left is much larger than right. Move values that equal the pivot to the center partition, where they will stay undisturbed. Make sure the partition has a lot of pivot values
208+
DQ(lenl+1, xchgx0=i; if(v[l+i]!=pivot)break;) xchgx0+=l; // first discard trailing pivot values, leaving xchgx0 at a non-pivot
209+
DQ(xchgx0-l+1, I rdx=l+i; rdx=v[rdx]==pivot?xchgx0:rdx; xchgx0-=v[l+i]==pivot; v[l+i]=v[rdx]; if(xchgx0-(l+i)>(xchgx1+10)-xchgx0)break;) // if [i]=pivot, move[xchgx0] down, else rewrite in place; if # not moved exceeds 10+#moved, abort, it's not lopsided enough
210+
// now xchgx0 is the index of the end of the left partition (could be l-1)
211+
DQ((xchgx1-1)-(xchgx0+1), v[xchgx0+1+i]=pivot;) // install pivots from xchgx0+1 to before xchgx1-1. This creates a middle partition from the pivots
212+
lenl=xchgx0-l; // shorten the left partition to exclude the middles
213+
}else{
214+
// similarly, when the right partition is the larger
215+
I nonpivx; DO(lenr+1, nonpivx=i; if(v[xchgx1+i]!=pivot)break;) xchgx1=nonpivx+=xchgx1; // first discard leading pivot values, leaving nonpivx and xchgx1 at a non-pivot. xchgx1 will always point to the first unswapped non-pivot
216+
DO(r-nonpivx+1, I rdx=nonpivx+i; rdx=v[rdx]==pivot?xchgx1:rdx; xchgx1+=v[nonpivx+i]==pivot; v[nonpivx+i]=v[rdx]; if((nonpivx+i)-xchgx1>(xchgx1+10)-xchgx0)break;) // if [i]=pivot, move[xchgx1] up, else rewrite in place; if # not moved exceeds 10+#moved, abort, it's not lopsided enough
217+
// now xchgx1 is the index of the start of the right partition (could be r+1)
218+
DO(xchgx1-nonpivx, v[nonpivx+i]=pivot;) // install pivots from xchgx0+1 to before xchgx1-1. This creates a middle partition from the pivots
219+
lenr=r-xchgx1; // shorten the right partition to exclude the middles
220+
}
221+
}
222+
223+
// push stack for the larger partition; modify batch pointers for the smaller
203224
// make l,r the smaller partition and l0,r0 the larger; then stack l0,r0
204225
l0=lenl>lenr?l0:xchgx1; l=lenl>lenr?xchgx1:l; r0=lenl>lenr?xchgx0:r0; r=lenl>lenr?r:xchgx0;
205226
stack[stackp][0]=l0; stack[stackp][1]=r0; ++stackp;

0 commit comments

Comments
 (0)