From 855277bfb829deb7fa4b84640abe7dba60147bec Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 17 May 2023 14:10:11 -0700 Subject: [PATCH 1/3] ENH: Add sort keyword to stack --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/frame.py | 8 +++-- pandas/core/reshape/reshape.py | 22 ++++++++----- pandas/tests/frame/test_stack_unstack.py | 42 ++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 1c0798e6cf9b1..73e774c170c75 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -97,6 +97,7 @@ Other enhancements - Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`) - Performance improvement in :func:`read_csv` (:issue:`52632`) with ``engine="c"`` - :meth:`Categorical.from_codes` has gotten a ``validate`` parameter (:issue:`50975`) +- :meth:`DataFrame.stack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`) - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`) - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 19564afc41b49..b15e26dfbc642 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8995,7 +8995,7 @@ def pivot_table( sort=sort, ) - def stack(self, level: Level = -1, dropna: bool = True): + def stack(self, level: Level = -1, dropna: bool = True, sort: bool = True): """ Stack the prescribed level(s) from columns to index. @@ -9021,6 +9021,8 @@ def stack(self, level: Level = -1, dropna: bool = True): axis can create combinations of index and column values that are missing from the original dataframe. See Examples section. + sort : bool, default True + Whether to sort the levels of the resulting MultiIndex. Returns ------- @@ -9164,9 +9166,9 @@ def stack(self, level: Level = -1, dropna: bool = True): ) if isinstance(level, (tuple, list)): - result = stack_multiple(self, level, dropna=dropna) + result = stack_multiple(self, level, dropna=dropna, sort=sort) else: - result = stack(self, level, dropna=dropna) + result = stack(self, level, dropna=dropna, sort=sort) return result.__finalize__(self, method="stack") diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 65fd9137313f1..c657ce96358ed 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -28,6 +28,7 @@ from pandas.core.dtypes.missing import notna import pandas.core.algorithms as algos +from pandas.core.algorithms import unique from pandas.core.arrays.categorical import factorize_from_iterable from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.frame import DataFrame @@ -545,7 +546,7 @@ def _unstack_extension_series(series: Series, level, fill_value) -> DataFrame: return result -def stack(frame: DataFrame, level=-1, dropna: bool = True): +def stack(frame: DataFrame, level=-1, dropna: bool = True, sort: bool = True): """ Convert DataFrame to Series with multi-level Index. Columns become the second level of the resulting hierarchical index @@ -567,7 +568,9 @@ def factorize(index): level_num = frame.columns._get_level_number(level) if isinstance(frame.columns, MultiIndex): - return _stack_multi_columns(frame, level_num=level_num, dropna=dropna) + return _stack_multi_columns( + frame, level_num=level_num, dropna=dropna, sort=sort + ) elif isinstance(frame.index, MultiIndex): new_levels = list(frame.index.levels) new_codes = [lab.repeat(K) for lab in frame.index.codes] @@ -620,13 +623,13 @@ def factorize(index): return frame._constructor_sliced(new_values, index=new_index) -def stack_multiple(frame: DataFrame, level, dropna: bool = True): +def stack_multiple(frame: DataFrame, level, dropna: bool = True, sort: bool = True): # If all passed levels match up to column names, no # ambiguity about what to do if all(lev in frame.columns.names for lev in level): result = frame for lev in level: - result = stack(result, lev, dropna=dropna) + result = stack(result, lev, dropna=dropna, sort=sort) # Otherwise, level numbers may change as each successive level is stacked elif all(isinstance(lev, int) for lev in level): @@ -639,7 +642,7 @@ def stack_multiple(frame: DataFrame, level, dropna: bool = True): while level: lev = level.pop(0) - result = stack(result, lev, dropna=dropna) + result = stack(result, lev, dropna=dropna, sort=sort) # Decrement all level numbers greater than current, as these # have now shifted down by one level = [v if v <= lev else v - 1 for v in level] @@ -681,7 +684,7 @@ def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex: def _stack_multi_columns( - frame: DataFrame, level_num: int = -1, dropna: bool = True + frame: DataFrame, level_num: int = -1, dropna: bool = True, sort: bool = True ) -> DataFrame: def _convert_level_number(level_num: int, columns: Index): """ @@ -711,7 +714,7 @@ def _convert_level_number(level_num: int, columns: Index): roll_columns = roll_columns.swaplevel(lev1, lev2) this.columns = mi_cols = roll_columns - if not mi_cols._is_lexsorted(): + if not mi_cols._is_lexsorted() and sort: # Workaround the edge case where 0 is one of the column names, # which interferes with trying to sort based on the first # level @@ -725,7 +728,10 @@ def _convert_level_number(level_num: int, columns: Index): # time to ravel the values new_data = {} level_vals = mi_cols.levels[-1] - level_codes = sorted(set(mi_cols.codes[-1])) + level_codes = unique(mi_cols.codes[-1]) + if sort: + level_codes = sorted(level_codes) + # level_codes = sorted(set(mi_cols.codes[-1])) level_vals_nan = level_vals.insert(len(level_vals), None) level_vals_used = np.take(level_vals_nan, level_codes) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 889c44522f7bb..739b9719a57f5 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1357,6 +1357,48 @@ def test_unstack_non_slice_like_blocks(using_array_manager): tm.assert_frame_equal(res, expected) +def test_stack_sort_false(): + # GH 15105 + data = [[1, 2, 3.0, 4.0], [2, 3, 4.0, 5.0], [3, 4, np.nan, np.nan]] + df = DataFrame( + data, + columns=MultiIndex( + levels=[["B", "A"], ["x", "y"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] + ), + ) + result = df.stack(level=0, sort=False) + expected = DataFrame( + {"x": [1.0, 3.0, 2.0, 4.0, 3.0], "y": [2.0, 4.0, 3.0, 5.0, 4.0]}, + index=MultiIndex.from_arrays([[0, 0, 1, 1, 2], ["B", "A", "B", "A", "B"]]), + ) + tm.assert_frame_equal(result, expected) + + # Codes sorted in this call + df = DataFrame( + data, + columns=MultiIndex.from_arrays([["B", "B", "A", "A"], ["x", "y", "x", "y"]]), + ) + result = df.stack(level=0, sort=False) + tm.assert_frame_equal(result, expected) + + +def test_stack_sort_false_multi_level(): + # GH 15105 + idx = MultiIndex.from_tuples([("weight", "kg"), ("height", "m")]) + df = DataFrame([[1.0, 2.0], [3.0, 4.0]], index=["cat", "dog"], columns=idx) + result = df.stack([0, 1], sort=False) + expected_index = MultiIndex.from_tuples( + [ + ("cat", "weight", "kg"), + ("cat", "height", "m"), + ("dog", "weight", "kg"), + ("dog", "height", "m"), + ] + ) + expected = Series([1.0, 2.0, 3.0, 4.0], index=expected_index) + tm.assert_series_equal(result, expected) + + class TestStackUnstackMultiLevel: def test_unstack(self, multiindex_year_month_day_dataframe_random_data): # just check that it works for now From 2b333b94b6a1d772ae19c829e622e979877f1fda Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 17 May 2023 14:12:09 -0700 Subject: [PATCH 2/3] Removed commented --- pandas/core/reshape/reshape.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c657ce96358ed..589e121f9fe82 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -731,7 +731,6 @@ def _convert_level_number(level_num: int, columns: Index): level_codes = unique(mi_cols.codes[-1]) if sort: level_codes = sorted(level_codes) - # level_codes = sorted(set(mi_cols.codes[-1])) level_vals_nan = level_vals.insert(len(level_vals), None) level_vals_used = np.take(level_vals_nan, level_codes) From 66688c1e0b0b85e110a096de4f336df1dc38daff Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 May 2023 10:48:22 -0700 Subject: [PATCH 3/3] Use np.sort --- pandas/core/reshape/reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 589e121f9fe82..b61cdf4ad6992 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -730,7 +730,7 @@ def _convert_level_number(level_num: int, columns: Index): level_vals = mi_cols.levels[-1] level_codes = unique(mi_cols.codes[-1]) if sort: - level_codes = sorted(level_codes) + level_codes = np.sort(level_codes) level_vals_nan = level_vals.insert(len(level_vals), None) level_vals_used = np.take(level_vals_nan, level_codes)