@@ -1628,6 +1628,15 @@ sortslice_advance(sortslice *slice, Py_ssize_t n)
 /* Avoid malloc for small temp arrays. */
 #define MERGESTATE_TEMP_SIZE 256

+/* The largest value of minrun. This must be a power of 2, and >= 1, so that
+ * the compute_minrun() algorithm guarantees to return a result no larger than
+ * this.
+ */
+#define MAX_MINRUN 64
+#if ((MAX_MINRUN) < 1) || ((MAX_MINRUN) & ((MAX_MINRUN) - 1))
+#error "MAX_MINRUN must be a power of 2, and >= 1"
+#endif
+
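The #if guard above relies on the usual bit trick: an integer x >= 1 is a power of 2 exactly when x & (x - 1) is zero, because subtracting 1 flips the lone set bit and everything below it. A minimal runtime sketch of the same test (the helper name is made up for illustration and is not part of this patch):

#include <stdbool.h>
#include <stddef.h>

/* true iff x is a power of 2 (which also forces x >= 1) */
static bool
is_power_of_two(size_t x)
{
    return x != 0 && (x & (x - 1)) == 0;
}

/* is_power_of_two(64) and is_power_of_two(1) are true;
   is_power_of_two(0) and is_power_of_two(48) are false. */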
 /* One MergeState exists on the stack per invocation of mergesort. It's just
  * a convenient way to pass state around among the helper functions.
  */
@@ -1685,68 +1694,133 @@ struct s_MergeState {
     int (*tuple_elem_compare)(PyObject *, PyObject *, MergeState *);
 };

-/* binarysort is the best method for sorting small arrays: it does
-   few compares, but can do data movement quadratic in the number of
-   elements.
-   [lo.keys, hi) is a contiguous slice of a list of keys, and is sorted via
-   binary insertion. This sort is stable.
-   On entry, must have lo.keys <= start <= hi, and that
-   [lo.keys, start) is already sorted (pass start == lo.keys if you don't
-   know!).
-   If islt() complains return -1, else 0.
+/* binarysort is the best method for sorting small arrays: it does few
+   compares, but can do data movement quadratic in the number of elements.
+   ss->keys is viewed as an array of n keys, a[:n]. a[:ok] is already sorted.
+   Pass ok = 0 (or 1) if you don't know.
+   It's sorted in-place, by a stable binary insertion sort. If ss->values
+   isn't NULL, it's permuted in lockstep with ss->keys.
+   On entry, must have n >= 1, and 0 <= ok <= n <= MAX_MINRUN.
+   Return -1 if comparison raises an exception, else 0.
    Even in case of error, the output slice will be some permutation of
    the input (nothing is lost or duplicated).
 */
 static int
-binarysort(MergeState *ms, sortslice lo, PyObject **hi, PyObject **start)
+binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
 {
-    Py_ssize_t k;
-    PyObject **l, **p, **r;
+    Py_ssize_t k; /* for IFLT macro expansion */
+    PyObject ** const a = ss->keys;
+    PyObject ** const v = ss->values;
+    const bool has_values = v != NULL;
     PyObject *pivot;
-
-    assert(lo.keys <= start && start <= hi);
-    /* assert [lo.keys, start) is sorted */
-    if (lo.keys == start)
-        ++start;
-    for (; start < hi; ++start) {
-        /* set l to where *start belongs */
-        l = lo.keys;
-        r = start;
-        pivot = *r;
-        /* Invariants:
-         * pivot >= all in [lo.keys, l).
-         * pivot < all in [r, start).
-         * These are vacuously true at the start.
+    Py_ssize_t M;
+
+    assert(0 <= ok && ok <= n && 1 <= n && n <= MAX_MINRUN);
+    /* assert a[:ok] is sorted */
+    if (!ok)
+        ++ok;
+    /* Regular insertion sort has average- and worst-case O(n**2) cost
+       for both # of comparisons and number of bytes moved. But its branches
+       are highly predictable, and it loves sorted input (n-1 compares and no
+       data movement). This is significant in cases like sortperf.py's %sort,
+       where an out-of-order element near the start of a run is moved into
+       place slowly but then the remaining elements up to length minrun are
+       generally at worst one slot away from their correct position (so only
+       need 1 or 2 compares to resolve). If comparisons are very fast (such
+       as for a list of Python floats), the simple inner loop leaves it
+       very competitive with binary insertion, despite that it does
+       significantly more compares overall on random data.
+
+       Binary insertion sort has worst, average, and best case O(n log n)
+       cost for # of comparisons, but worst and average case O(n**2) cost
+       for data movement. The more expensive the comparisons, the more
+       important the comparison advantage. But its branches are less
+       predictable the more "randomish" the data, and that's so significant
+       that its worst case in real life is random input rather than
+       reverse-ordered input (which does about twice the data movement that
+       random input does).
+
+       Note that the number of bytes moved doesn't seem to matter. MAX_MINRUN
+       of 64 is so small that the key and value pointers all fit in a corner
+       of L1 cache, and moving things around in that is very fast. */
+#if 0 // ordinary insertion sort.
+    PyObject *vpivot = NULL;
+    for (; ok < n; ++ok) {
+        pivot = a[ok];
+        if (has_values)
+            vpivot = v[ok];
+        for (M = ok - 1; M >= 0; --M) {
+            k = ISLT(pivot, a[M]);
+            if (k < 0) {
+                a[M + 1] = pivot;
+                if (has_values)
+                    v[M + 1] = vpivot;
+                goto fail;
+            }
+            else if (k) {
+                a[M + 1] = a[M];
+                if (has_values)
+                    v[M + 1] = v[M];
+            }
+            else
+                break;
+        }
+        a[M + 1] = pivot;
+        if (has_values)
+            v[M + 1] = vpivot;
+    }
+#else // binary insertion sort
+    Py_ssize_t L, R;
+    for (; ok < n; ++ok) {
+        /* set L to where a[ok] belongs */
+        L = 0;
+        R = ok;
+        pivot = a[ok];
+        /* Slice invariants, vacuously true at the start:
+         * all a[0:L]  <= pivot
+         * all a[L:R]  unknown
+         * all a[R:ok] >  pivot
         */
-        assert(l < r);
+        assert(L < R);
         do {
-            p = l + ((r - l) >> 1);
-            IFLT(pivot, *p)
-                r = p;
+            /* don't do silly ;-) things to prevent overflow when finding
+               the midpoint; L and R are very far from filling a Py_ssize_t */
+            M = (L + R) >> 1;
+#if 1 // straightforward, but highly unpredictable branch on random data
+            IFLT(pivot, a[M])
+                R = M;
             else
-                l = p + 1;
-        } while (l < r);
-        assert(l == r);
-        /* The invariants still hold, so pivot >= all in [lo.keys, l) and
-           pivot < all in [l, start), so pivot belongs at l. Note
-           that if there are elements equal to pivot, l points to the
-           first slot after them -- that's why this sort is stable.
-           Slide over to make room.
-           Caution: using memmove is much slower under MSVC 5;
-           we're not usually moving many slots. */
-        for (p = start; p > l; --p)
-            *p = *(p - 1);
-        *l = pivot;
-        if (lo.values != NULL) {
-            Py_ssize_t offset = lo.values - lo.keys;
-            p = start + offset;
-            pivot = *p;
-            l += offset;
-            for ( ; p > l; --p)
-                *p = *(p - 1);
-            *l = pivot;
+                L = M + 1;
+#else
+            /* Try to get compiler to generate conditional move instructions
+               instead. Works fine, but leaving it disabled for now because
+               it's not yielding consistently faster sorts. Needs more
+               investigation. More computation in the inner loop adds its own
+               costs, which can be significant when compares are fast. */
+            k = ISLT(pivot, a[M]);
+            if (k < 0)
+                goto fail;
+            Py_ssize_t Mp1 = M + 1;
+            R = k ? M : R;
+            L = k ? L : Mp1;
+#endif
+        } while (L < R);
+        assert(L == R);
+        /* a[:L] holds all elements from a[:ok] <= pivot now, so pivot belongs
+           at index L. Slide a[L:ok] to the right a slot to make room for it.
+           Caution: using memmove is much slower under MSVC 5; we're not
+           usually moving many slots. Years later: under Visual Studio 2022,
+           memmove seems just slightly slower than doing it "by hand". */
+        for (M = ok; M > L; --M)
+            a[M] = a[M - 1];
+        a[L] = pivot;
+        if (has_values) {
+            pivot = v[ok];
+            for (M = ok; M > L; --M)
+                v[M] = v[M - 1];
+            v[L] = pivot;
         }
     }
+#endif // pick binary or regular insertion sort
     return 0;

 fail:
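For readers who want the new technique in isolation, here is a minimal self-contained sketch of the same stable binary-insertion scheme on a plain int array, with the CPython specifics (sortslice, the IFLT/ISLT error handling, the parallel values array) stripped out. The function name and the direct < comparison are assumptions for illustration only, not part of the patch:

#include <stddef.h>

/* Stable binary insertion sort of a[:n], where a[:ok] is already sorted
   (pass ok == 0 or 1 if you don't know).  Mirrors the shape of the new
   binarysort(): binary search for the insertion point, then slide a slot. */
static void
binary_insertion_sort_ints(int *a, size_t n, size_t ok)
{
    if (ok == 0)
        ok = 1;
    for (; ok < n; ++ok) {
        const int pivot = a[ok];
        size_t L = 0, R = ok;
        /* Invariants: a[0:L] <= pivot, a[L:R] unknown, a[R:ok] > pivot. */
        while (L < R) {
            const size_t M = (L + R) >> 1;
            if (pivot < a[M])
                R = M;
            else
                L = M + 1;   /* ties go right, which keeps the sort stable */
        }
        /* Slide a[L:ok] right one slot and drop the pivot into place. */
        for (size_t M = ok; M > L; --M)
            a[M] = a[M - 1];
        a[L] = pivot;
    }
}

Taking the else branch on equal keys is what makes the sort stable: the pivot always lands just after any elements equal to it, so equal elements keep their original order.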
@@ -2559,10 +2633,10 @@ merge_force_collapse(MergeState *ms)
 /* Compute a good value for the minimum run length; natural runs shorter
  * than this are boosted artificially via binary insertion.
  *
- * If n < 64, return n (it's too small to bother with fancy stuff).
- * Else if n is an exact power of 2, return 32.
- * Else return an int k, 32 <= k <= 64, such that n/k is close to, but
- * strictly less than, an exact power of 2.
+ * If n < MAX_MINRUN, return n (it's too small to bother with fancy stuff).
+ * Else if n is an exact power of 2, return MAX_MINRUN / 2.
+ * Else return an int k, MAX_MINRUN / 2 <= k <= MAX_MINRUN, such that n/k is
+ * close to, but strictly less than, an exact power of 2.
  *
  * See listsort.txt for more info.
  */
@@ -2572,7 +2646,7 @@ merge_compute_minrun(Py_ssize_t n)
     Py_ssize_t r = 0;           /* becomes 1 if any 1 bits are shifted off */

     assert(n >= 0);
-    while (n >= 64) {
+    while (n >= MAX_MINRUN) {
         r |= n & 1;
         n >>= 1;
     }
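The hunk stops just before the function's return statement, which stays return n + r; in the existing code. A standalone sketch of the same computation, with MAX_MINRUN hard-wired to 64 and ptrdiff_t standing in for Py_ssize_t (names here are illustrative only), makes the documented MAX_MINRUN / 2 <= k <= MAX_MINRUN behavior easy to check by hand:

#include <assert.h>
#include <stddef.h>

#define DEMO_MAX_MINRUN 64   /* stands in for MAX_MINRUN */

/* Shift n right until it drops below DEMO_MAX_MINRUN, remembering in r
   whether any 1 bits were shifted off, then round up by r. */
static ptrdiff_t
demo_compute_minrun(ptrdiff_t n)
{
    ptrdiff_t r = 0;          /* becomes 1 if any 1 bits are shifted off */
    assert(n >= 0);
    while (n >= DEMO_MAX_MINRUN) {
        r |= n & 1;
        n >>= 1;
    }
    return n + r;
}

/* demo_compute_minrun(63)   == 63  (n < 64 is returned unchanged)
   demo_compute_minrun(2048) == 32  (exact power of 2 -> MAX_MINRUN / 2)
   demo_compute_minrun(2112) == 33  (2112 splits into 64 runs of length 33)
   demo_compute_minrun(2113) == 34  (2113 / 34 is just under 64) */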
@@ -2956,7 +3030,7 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse)
         if (n < minrun) {
             const Py_ssize_t force = nremaining <= minrun ?
                               nremaining : minrun;
-            if (binarysort(&ms, lo, lo.keys + force, lo.keys + n) < 0)
+            if (binarysort(&ms, &lo, force, n) < 0)
                 goto fail;
             n = force;
         }
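To see where that call sits, here is a hedged sketch (plain ints, ascending runs only, no merging) of the driver pattern the hunk above belongs to: measure the natural run at the front of the remaining data and, if it is shorter than minrun, boost it with the stable binary insertion sort sketched earlier. All names are illustrative; the real count_run() also handles descending runs.

#include <stddef.h>

/* From the earlier sketch: stable binary insertion sort of a[:n],
   with a[:ok] already sorted. */
static void binary_insertion_sort_ints(int *a, size_t n, size_t ok);

/* Length (>= 1) of the ascending run at the front of a[:nremaining]. */
static size_t
count_ascending_run(const int *a, size_t nremaining)
{
    size_t n = 1;
    while (n < nremaining && a[n - 1] <= a[n])
        ++n;
    return n;
}

/* Mirror of the pattern above: boost a too-short natural run to minrun. */
static size_t
next_boosted_run(int *a, size_t nremaining, size_t minrun)
{
    size_t n = count_ascending_run(a, nremaining);
    if (n < minrun) {
        const size_t force = nremaining <= minrun ? nremaining : minrun;
        binary_insertion_sort_ints(a, force, n);   /* a[:n] is already sorted */
        n = force;
    }
    return n;   /* length of the sorted run now at the front */
}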