From 0270839bcca394536cd5421c0d6ea2263c1ff164 Mon Sep 17 00:00:00 2001 From: Thomas Spura Date: Fri, 21 Dec 2018 09:46:30 +0100 Subject: [PATCH 1/7] Series.value_counts: Preserve original ordering when using sort=False Ensure that value_counts returns the same ordering of the indices than the input object when sorting the values no matter if it is ascending or descending. This fixes #12679. --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/core/algorithms.py | 6 +++++- pandas/tests/test_algos.py | 36 +++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 78f864f0dcb73..708091e2c48e0 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1634,6 +1634,7 @@ Other - Bug where C variables were declared with external linkage causing import errors if certain other C libraries were imported before Pandas. (:issue:`24113`) - Require at least 0.28.2 version of ``cython`` to support read-only memoryviews (:issue:`21688`) +- :meth:`Series.value_counts` returns the counts in the same ordering as the original series when using ``sort=False`` (:issue:`12679`) .. _whatsnew_0.24.0.contributors: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 3c4fe519e4181..efed19751fc98 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -666,7 +666,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, value_counts : Series """ - from pandas.core.series import Series, Index + from pandas import Series, Index, CategoricalIndex name = getattr(values, 'name', None) if bins is not None: @@ -708,6 +708,10 @@ def value_counts(values, sort=True, ascending=False, normalize=False, if sort: result = result.sort_values(ascending=ascending) + else: + uniq = unique(values) + if not isinstance(result.index, CategoricalIndex): + result = result.reindex(uniq) if normalize: result = result / float(counts.sum()) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index c9d403f6696af..fb20f0aace6e0 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -962,6 +962,42 @@ def test_value_counts_uint64(self): if not compat.is_platform_32bit(): tm.assert_series_equal(result, expected) + def test_value_counts_nonsorted(self): + # All items occour exactly once. + # No matter if sorted or not, the resulting values should be in + # the same order. + s = Series(list('bacdef')) + + # Garantee the same index if value_counts(sort=False) is used + vc = s.value_counts(sort=False, ascending=False) + tm.assert_series_equal(Series(vc.index), s) + vc = s.value_counts(sort=False, ascending=True) + tm.assert_series_equal(Series(vc.index), s) + + # Garantee does not hold yet for the sort=True case + # vc = s.value_counts(sort=True, ascending=False) + # tm.assert_series_equal(Series(vc.index), s) + # vc = s.value_counts(sort=True, ascending=True) + # tm.assert_series_equal(Series(vc.index), s) + + # 'a' is there twice. Sorted, it should be there at the top. + # Unsorted it should stay where it is. + s = Series(list('bacaef')) + ref_nonsorted = Series(list('bacef')) + ref_sorted = Series(list('abcef')) + + # Garantee the same index if value_counts(sort=False) is used + vc = s.value_counts(sort=False, ascending=False) + tm.assert_series_equal(Series(vc.index), ref_nonsorted) + vc = s.value_counts(sort=False, ascending=True) + tm.assert_series_equal(Series(vc.index), ref_nonsorted) + + # Garantee does not hold yet for the sort=True case + # vc = s.value_counts(sort=True, ascending=False) + # tm.assert_series_equal(Series(vc.index), ref_sorted) + # vc = s.value_counts(sort=True, ascending=True) + # tm.assert_series_equal(Series(vc.index), ref_sorted) + class TestDuplicated(object): From c8571195ee133523c57c28df794bf1e3bcd26bde Mon Sep 17 00:00:00 2001 From: Thomas Spura Date: Fri, 21 Dec 2018 12:37:17 +0100 Subject: [PATCH 2/7] Only reindex value_counts if bins is None --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index efed19751fc98..17d64b937e0a0 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -708,7 +708,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, if sort: result = result.sort_values(ascending=ascending) - else: + elif bins is None: uniq = unique(values) if not isinstance(result.index, CategoricalIndex): result = result.reindex(uniq) From e966aa7a137c284971bc8a9e85f5a7da38694828 Mon Sep 17 00:00:00 2001 From: Thomas Spura Date: Sun, 23 Dec 2018 14:34:35 +0100 Subject: [PATCH 3/7] Modularize value_counts tests and xfail sort=True ones --- pandas/tests/test_algos.py | 40 ++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index fb20f0aace6e0..2d05752eb1f98 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -962,41 +962,51 @@ def test_value_counts_uint64(self): if not compat.is_platform_32bit(): tm.assert_series_equal(result, expected) - def test_value_counts_nonsorted(self): + def test_value_counts_nonsorted_single_occurance(self): + # GH 12679 # All items occour exactly once. # No matter if sorted or not, the resulting values should be in # the same order. s = Series(list('bacdef')) - # Garantee the same index if value_counts(sort=False) is used + # Guarantee the same index if value_counts(sort=False) is used vc = s.value_counts(sort=False, ascending=False) tm.assert_series_equal(Series(vc.index), s) vc = s.value_counts(sort=False, ascending=True) tm.assert_series_equal(Series(vc.index), s) - # Garantee does not hold yet for the sort=True case - # vc = s.value_counts(sort=True, ascending=False) - # tm.assert_series_equal(Series(vc.index), s) - # vc = s.value_counts(sort=True, ascending=True) - # tm.assert_series_equal(Series(vc.index), s) + @pytest.mark.xfail(reason="sort=True does not guarantee the same order") + def test_value_counts_sorted_single_occurance(self): + # GH 12679 + s = Series(list('bacdef')) + # Guarantee does not hold yet for the sort=True case + vc = s.value_counts(sort=True, ascending=False) + tm.assert_series_equal(Series(vc.index), s) + vc = s.value_counts(sort=True, ascending=True) + tm.assert_series_equal(Series(vc.index), s) + def test_value_counts_nonsorted_double_occurance(self): + # GH 12679 # 'a' is there twice. Sorted, it should be there at the top. - # Unsorted it should stay where it is. s = Series(list('bacaef')) ref_nonsorted = Series(list('bacef')) - ref_sorted = Series(list('abcef')) - # Garantee the same index if value_counts(sort=False) is used + # Guarantee the same index if value_counts(sort=False) is used vc = s.value_counts(sort=False, ascending=False) tm.assert_series_equal(Series(vc.index), ref_nonsorted) vc = s.value_counts(sort=False, ascending=True) tm.assert_series_equal(Series(vc.index), ref_nonsorted) - # Garantee does not hold yet for the sort=True case - # vc = s.value_counts(sort=True, ascending=False) - # tm.assert_series_equal(Series(vc.index), ref_sorted) - # vc = s.value_counts(sort=True, ascending=True) - # tm.assert_series_equal(Series(vc.index), ref_sorted) + @pytest.mark.xfail(reason="sort=True does not guarantee the same order") + def test_value_counts_sorted_double_occurance(self): + # GH 12679 + s = Series(list('bacaef')) + ref_sorted = Series(list('abcef')) + # Guarantee does not hold yet for the sort=True case + vc = s.value_counts(sort=True, ascending=False) + tm.assert_series_equal(Series(vc.index), ref_sorted) + vc = s.value_counts(sort=True, ascending=True) + tm.assert_series_equal(Series(vc.index), ref_sorted) class TestDuplicated(object): From b827df693cd85c6d8c3e56126240c76fbf78f9f0 Mon Sep 17 00:00:00 2001 From: Thomas Spura Date: Sun, 23 Dec 2018 17:27:27 +0100 Subject: [PATCH 4/7] Move whatsnew entry to correct place --- doc/source/whatsnew/v0.24.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 708091e2c48e0..4d64997ccfe56 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -385,6 +385,7 @@ Backwards incompatible API changes - :func:`read_csv` will now raise a ``ValueError`` if a column with missing values is declared as having dtype ``bool`` (:issue:`20591`) - The column order of the resultant :class:`DataFrame` from :meth:`MultiIndex.to_frame` is now guaranteed to match the :attr:`MultiIndex.names` order. (:issue:`22420`) - :func:`pd.offsets.generate_range` argument ``time_rule`` has been removed; use ``offset`` instead (:issue:`24157`) +- :meth:`Series.value_counts` returns the counts in the same ordering as the original series when using ``sort=False`` (:issue:`12679`) Percentage change on groupby ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1634,7 +1635,6 @@ Other - Bug where C variables were declared with external linkage causing import errors if certain other C libraries were imported before Pandas. (:issue:`24113`) - Require at least 0.28.2 version of ``cython`` to support read-only memoryviews (:issue:`21688`) -- :meth:`Series.value_counts` returns the counts in the same ordering as the original series when using ``sort=False`` (:issue:`12679`) .. _whatsnew_0.24.0.contributors: From 453d3dfc763acd1aa66452d146ce449aee201cae Mon Sep 17 00:00:00 2001 From: Thomas Spura Date: Sun, 23 Dec 2018 18:00:17 +0100 Subject: [PATCH 5/7] Use ABCCategoricalIndex instead of CategoricalIndex --- pandas/core/algorithms.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 17d64b937e0a0..6faec1eeecdb4 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -666,7 +666,8 @@ def value_counts(values, sort=True, ascending=False, normalize=False, value_counts : Series """ - from pandas import Series, Index, CategoricalIndex + from pandas.core.series import Series, Index + from pandas.core.dtypes.generic import ABCCategoricalIndex name = getattr(values, 'name', None) if bins is not None: @@ -710,7 +711,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, result = result.sort_values(ascending=ascending) elif bins is None: uniq = unique(values) - if not isinstance(result.index, CategoricalIndex): + if not isinstance(result.index, ABCCategoricalIndex): result = result.reindex(uniq) if normalize: From 405ac0ea232422f4c7e47bc732ea48e2c5ff39c8 Mon Sep 17 00:00:00 2001 From: Thomas Spura Date: Sun, 23 Dec 2018 19:11:34 +0100 Subject: [PATCH 6/7] Calculate unique for the EA vs non-EA case --- pandas/core/algorithms.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6faec1eeecdb4..65fefe265dadc 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -696,12 +696,15 @@ def value_counts(values, sort=True, ascending=False, normalize=False, if is_extension_array_dtype(values) or is_sparse(values): # handle Categorical and sparse, - result = Series(values)._values.value_counts(dropna=dropna) + vals = Series(values) + uniq = vals._values.unique() + result = vals._values.value_counts(dropna=dropna) result.name = name counts = result.values else: keys, counts = _value_counts_arraylike(values, dropna) + uniq = unique(values) if not isinstance(keys, Index): keys = Index(keys) @@ -710,7 +713,6 @@ def value_counts(values, sort=True, ascending=False, normalize=False, if sort: result = result.sort_values(ascending=ascending) elif bins is None: - uniq = unique(values) if not isinstance(result.index, ABCCategoricalIndex): result = result.reindex(uniq) From 355f872b29fc7eba57379a213562b8e5e5e995bb Mon Sep 17 00:00:00 2001 From: Thomas Spura Date: Sun, 23 Dec 2018 19:14:39 +0100 Subject: [PATCH 7/7] Parametrize value_counts test --- pandas/tests/test_algos.py | 50 +++++++++++++++----------------------- 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 2d05752eb1f98..cb32700549c76 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -962,51 +962,41 @@ def test_value_counts_uint64(self): if not compat.is_platform_32bit(): tm.assert_series_equal(result, expected) - def test_value_counts_nonsorted_single_occurance(self): + @pytest.mark.parametrize("sort", + [pytest.param(True, marks=pytest.mark.xfail), False]) + @pytest.mark.parametrize("ascending", [True, False]) + def test_value_counts_single_occurance(self, sort, ascending): # GH 12679 # All items occour exactly once. # No matter if sorted or not, the resulting values should be in # the same order. - s = Series(list('bacdef')) + expected = Series(list('bacdef')) # Guarantee the same index if value_counts(sort=False) is used - vc = s.value_counts(sort=False, ascending=False) - tm.assert_series_equal(Series(vc.index), s) - vc = s.value_counts(sort=False, ascending=True) - tm.assert_series_equal(Series(vc.index), s) + vc = expected.value_counts(sort=sort, ascending=ascending) + result = Series(vc.index) + tm.assert_series_equal(result, expected) - @pytest.mark.xfail(reason="sort=True does not guarantee the same order") - def test_value_counts_sorted_single_occurance(self): - # GH 12679 - s = Series(list('bacdef')) - # Guarantee does not hold yet for the sort=True case - vc = s.value_counts(sort=True, ascending=False) - tm.assert_series_equal(Series(vc.index), s) - vc = s.value_counts(sort=True, ascending=True) - tm.assert_series_equal(Series(vc.index), s) - - def test_value_counts_nonsorted_double_occurance(self): + @pytest.mark.parametrize("ascending", [True, False]) + def test_value_counts_nonsorted_double_occurance(self, ascending): # GH 12679 # 'a' is there twice. Sorted, it should be there at the top. s = Series(list('bacaef')) - ref_nonsorted = Series(list('bacef')) + expected = Series(list('bacef')) - # Guarantee the same index if value_counts(sort=False) is used - vc = s.value_counts(sort=False, ascending=False) - tm.assert_series_equal(Series(vc.index), ref_nonsorted) - vc = s.value_counts(sort=False, ascending=True) - tm.assert_series_equal(Series(vc.index), ref_nonsorted) + vc = s.value_counts(sort=False, ascending=ascending) + result = Series(vc.index) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("ascending, expected", + [(True, Series(list('abcef'))), + (False, Series(list('bcefa')))]) @pytest.mark.xfail(reason="sort=True does not guarantee the same order") - def test_value_counts_sorted_double_occurance(self): + def test_value_counts_sorted_double_occurance(self, ascending, expected): # GH 12679 s = Series(list('bacaef')) - ref_sorted = Series(list('abcef')) - # Guarantee does not hold yet for the sort=True case - vc = s.value_counts(sort=True, ascending=False) - tm.assert_series_equal(Series(vc.index), ref_sorted) - vc = s.value_counts(sort=True, ascending=True) - tm.assert_series_equal(Series(vc.index), ref_sorted) + vc = s.value_counts(sort=True, ascending=ascending) + tm.assert_series_equal(Series(vc.index), expected) class TestDuplicated(object):