From 1eb5cd6d0824fa8329b41d4a0e9b39983b923772 Mon Sep 17 00:00:00 2001 From: Kian Eliasi Date: Wed, 10 Aug 2022 16:14:50 +0430 Subject: [PATCH 01/11] ENH: Warn when dtype is not passed to get_dummies --- pandas/core/reshape/encoding.py | 8 +++ pandas/tests/reshape/test_get_dummies.py | 87 +++++++++++++++--------- 2 files changed, 61 insertions(+), 34 deletions(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index da4de8cc57e65..bbea1baefcf0f 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -3,11 +3,13 @@ from collections import defaultdict import itertools from typing import Hashable +import warnings import numpy as np from pandas._libs.sparse import IntIndex from pandas._typing import Dtype +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_integer_dtype, @@ -228,6 +230,12 @@ def _get_dummies_1d( codes, levels = factorize_from_iterable(Series(data)) if dtype is None: + warnings.warn( + "The default dtype will change from 'uint8' to 'bool', " + "please specify a dtype to silence this warning", + FutureWarning, + stacklevel=find_stack_level(), + ) dtype = np.dtype(np.uint8) # error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str, # dtype[Any], Type[object]]"; expected "Type[Any]" diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 6c9a60caaa2be..063421f82a05a 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -45,6 +45,11 @@ def test_get_dummies_raises_on_dtype_object(self, df): with pytest.raises(ValueError, match=msg): get_dummies(df, dtype="object") + def test_get_dummies_warns_default_dtype(self, df): + msg = "The default dtype will change from 'uint8' to 'bool'" + with tm.assert_produces_warning(FutureWarning, match=msg): + get_dummies(df) + def test_get_dummies_basic(self, sparse, dtype): s_list = list("abc") s_series = Series(s_list) @@ -121,9 +126,11 @@ def test_get_dummies_just_na(self, sparse): just_na_series = Series(just_na_list) just_na_series_index = Series(just_na_list, index=["A"]) - res_list = get_dummies(just_na_list, sparse=sparse) - res_series = get_dummies(just_na_series, sparse=sparse) - res_series_index = get_dummies(just_na_series_index, sparse=sparse) + res_list = get_dummies(just_na_list, dtype=np.uint8, sparse=sparse) + res_series = get_dummies(just_na_series, dtype=np.uint8, sparse=sparse) + res_series_index = get_dummies( + just_na_series_index, dtype=np.uint8, sparse=sparse + ) assert res_list.empty assert res_series.empty @@ -169,7 +176,7 @@ def test_get_dummies_unicode(self, sparse): e = "e" eacute = unicodedata.lookup("LATIN SMALL LETTER E WITH ACUTE") s = [e, eacute, eacute] - res = get_dummies(s, prefix="letter", sparse=sparse) + res = get_dummies(s, dtype=np.uint8, prefix="letter", sparse=sparse) exp = DataFrame( {"letter_e": [1, 0, 0], f"letter_{eacute}": [0, 1, 1]}, dtype=np.uint8 ) @@ -179,7 +186,7 @@ def test_get_dummies_unicode(self, sparse): def test_dataframe_dummies_all_obj(self, df, sparse): df = df[["A", "B"]] - result = get_dummies(df, sparse=sparse) + result = get_dummies(df, dtype=np.uint8, sparse=sparse) expected = DataFrame( {"A_a": [1, 0, 1], "A_b": [0, 1, 0], "B_b": [1, 1, 0], "B_c": [0, 0, 1]}, dtype=np.uint8, @@ -200,7 +207,7 @@ def test_dataframe_dummies_string_dtype(self, df): # GH44965 df = df[["A", "B"]] df = df.astype({"A": "object", "B": "string"}) - result = get_dummies(df) + result = get_dummies(df, dtype=np.uint8) expected = DataFrame( { "A_a": [1, 0, 1], @@ -234,7 +241,7 @@ def test_dataframe_dummies_mix_default(self, df, sparse, dtype): def test_dataframe_dummies_prefix_list(self, df, sparse): prefixes = ["from_A", "from_B"] - result = get_dummies(df, prefix=prefixes, sparse=sparse) + result = get_dummies(df, dtype=np.uint8, prefix=prefixes, sparse=sparse) expected = DataFrame( { "C": [1, 2, 3], @@ -255,7 +262,7 @@ def test_dataframe_dummies_prefix_list(self, df, sparse): def test_dataframe_dummies_prefix_str(self, df, sparse): # not that you should do this... - result = get_dummies(df, prefix="bad", sparse=sparse) + result = get_dummies(df, dtype=np.uint8, prefix="bad", sparse=sparse) bad_columns = ["bad_a", "bad_b", "bad_b", "bad_c"] expected = DataFrame( [[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]], @@ -280,7 +287,9 @@ def test_dataframe_dummies_prefix_str(self, df, sparse): tm.assert_frame_equal(result, expected) def test_dataframe_dummies_subset(self, df, sparse): - result = get_dummies(df, prefix=["from_A"], columns=["A"], sparse=sparse) + result = get_dummies( + df, dtype=np.uint8, prefix=["from_A"], columns=["A"], sparse=sparse + ) expected = DataFrame( { "B": ["b", "b", "c"], @@ -298,7 +307,7 @@ def test_dataframe_dummies_subset(self, df, sparse): tm.assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_sep(self, df, sparse): - result = get_dummies(df, prefix_sep="..", sparse=sparse) + result = get_dummies(df, dtype=np.uint8, prefix_sep="..", sparse=sparse) expected = DataFrame( { "C": [1, 2, 3], @@ -317,11 +326,13 @@ def test_dataframe_dummies_prefix_sep(self, df, sparse): tm.assert_frame_equal(result, expected) - result = get_dummies(df, prefix_sep=["..", "__"], sparse=sparse) + result = get_dummies(df, dtype=np.uint8, prefix_sep=["..", "__"], sparse=sparse) expected = expected.rename(columns={"B..b": "B__b", "B..c": "B__c"}) tm.assert_frame_equal(result, expected) - result = get_dummies(df, prefix_sep={"A": "..", "B": "__"}, sparse=sparse) + result = get_dummies( + df, dtype=np.uint8, prefix_sep={"A": "..", "B": "__"}, sparse=sparse + ) tm.assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_bad_length(self, df, sparse): @@ -330,7 +341,7 @@ def test_dataframe_dummies_prefix_bad_length(self, df, sparse): "encoded (2)" ) with pytest.raises(ValueError, match=msg): - get_dummies(df, prefix=["too few"], sparse=sparse) + get_dummies(df, dtype=np.uint8, prefix=["too few"], sparse=sparse) def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse): msg = re.escape( @@ -338,12 +349,12 @@ def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse): "encoded (2)" ) with pytest.raises(ValueError, match=msg): - get_dummies(df, prefix_sep=["bad"], sparse=sparse) + get_dummies(df, dtype=np.uint8, prefix_sep=["bad"], sparse=sparse) def test_dataframe_dummies_prefix_dict(self, sparse): prefixes = {"A": "from_A", "B": "from_B"} df = DataFrame({"C": [1, 2, 3], "A": ["a", "b", "a"], "B": ["b", "b", "c"]}) - result = get_dummies(df, prefix=prefixes, sparse=sparse) + result = get_dummies(df, dtype=np.uint8, prefix=prefixes, sparse=sparse) expected = DataFrame( { @@ -453,16 +464,18 @@ def test_get_dummies_basic_drop_first(self, sparse): expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=np.uint8) - result = get_dummies(s_list, drop_first=True, sparse=sparse) + result = get_dummies(s_list, dtype=np.uint8, drop_first=True, sparse=sparse) if sparse: expected = expected.apply(SparseArray, fill_value=0) tm.assert_frame_equal(result, expected) - result = get_dummies(s_series, drop_first=True, sparse=sparse) + result = get_dummies(s_series, dtype=np.uint8, drop_first=True, sparse=sparse) tm.assert_frame_equal(result, expected) expected.index = list("ABC") - result = get_dummies(s_series_index, drop_first=True, sparse=sparse) + result = get_dummies( + s_series_index, dtype=np.uint8, drop_first=True, sparse=sparse + ) tm.assert_frame_equal(result, expected) def test_get_dummies_basic_drop_first_one_level(self, sparse): @@ -473,27 +486,31 @@ def test_get_dummies_basic_drop_first_one_level(self, sparse): expected = DataFrame(index=np.arange(3)) - result = get_dummies(s_list, drop_first=True, sparse=sparse) + result = get_dummies(s_list, dtype=np.uint8, drop_first=True, sparse=sparse) tm.assert_frame_equal(result, expected) - result = get_dummies(s_series, drop_first=True, sparse=sparse) + result = get_dummies(s_series, dtype=np.uint8, drop_first=True, sparse=sparse) tm.assert_frame_equal(result, expected) expected = DataFrame(index=list("ABC")) - result = get_dummies(s_series_index, drop_first=True, sparse=sparse) + result = get_dummies( + s_series_index, dtype=np.uint8, drop_first=True, sparse=sparse + ) tm.assert_frame_equal(result, expected) def test_get_dummies_basic_drop_first_NA(self, sparse): # Test NA handling together with drop_first s_NA = ["a", "b", np.nan] - res = get_dummies(s_NA, drop_first=True, sparse=sparse) + res = get_dummies(s_NA, dtype=np.uint8, drop_first=True, sparse=sparse) exp = DataFrame({"b": [0, 1, 0]}, dtype=np.uint8) if sparse: exp = exp.apply(SparseArray, fill_value=0) tm.assert_frame_equal(res, exp) - res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse) + res_na = get_dummies( + s_NA, dtype=np.uint8, dummy_na=True, drop_first=True, sparse=sparse + ) exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=np.uint8).reindex( ["b", np.nan], axis=1 ) @@ -502,14 +519,14 @@ def test_get_dummies_basic_drop_first_NA(self, sparse): tm.assert_frame_equal(res_na, exp_na) res_just_na = get_dummies( - [np.nan], dummy_na=True, drop_first=True, sparse=sparse + [np.nan], dtype=np.uint8, dummy_na=True, drop_first=True, sparse=sparse ) exp_just_na = DataFrame(index=np.arange(1)) tm.assert_frame_equal(res_just_na, exp_just_na) def test_dataframe_dummies_drop_first(self, df, sparse): df = df[["A", "B"]] - result = get_dummies(df, drop_first=True, sparse=sparse) + result = get_dummies(df, dtype=np.uint8, drop_first=True, sparse=sparse) expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=np.uint8) if sparse: expected = expected.apply(SparseArray, fill_value=0) @@ -517,7 +534,7 @@ def test_dataframe_dummies_drop_first(self, df, sparse): def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype): df["cat"] = Categorical(["x", "y", "y"]) - result = get_dummies(df, drop_first=True, sparse=sparse) + result = get_dummies(df, dtype=np.uint8, drop_first=True, sparse=sparse) expected = DataFrame( {"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]} ) @@ -532,7 +549,7 @@ def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype): def test_dataframe_dummies_drop_first_with_na(self, df, sparse): df.loc[3, :] = [np.nan, np.nan, np.nan] result = get_dummies( - df, dummy_na=True, drop_first=True, sparse=sparse + df, dtype=np.uint8, dummy_na=True, drop_first=True, sparse=sparse ).sort_index(axis=1) expected = DataFrame( { @@ -552,18 +569,20 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse): tm.assert_frame_equal(result, expected) - result = get_dummies(df, dummy_na=False, drop_first=True, sparse=sparse) + result = get_dummies( + df, dtype=np.uint8, dummy_na=False, drop_first=True, sparse=sparse + ) expected = expected[["C", "A_b", "B_c"]] tm.assert_frame_equal(result, expected) def test_get_dummies_int_int(self): data = Series([1, 2, 1]) - result = get_dummies(data) + result = get_dummies(data, dtype=np.uint8) expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=np.uint8) tm.assert_frame_equal(result, expected) data = Series(Categorical(["a", "b", "a"])) - result = get_dummies(data) + result = get_dummies(data, dtype=np.uint8) expected = DataFrame( [[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=np.uint8 ) @@ -605,7 +624,7 @@ def test_dataframe_dummies_preserve_categorical_dtype(self, dtype, ordered): def test_get_dummies_dont_sparsify_all_columns(self, sparse): # GH18914 df = DataFrame.from_dict({"GDP": [1, 2], "Nation": ["AB", "CD"]}) - df = get_dummies(df, columns=["Nation"], sparse=sparse) + df = get_dummies(df, dtype=np.uint8, columns=["Nation"], sparse=sparse) df2 = df.reindex(columns=["GDP"]) tm.assert_frame_equal(df[["GDP"]], df2) @@ -613,7 +632,7 @@ def test_get_dummies_dont_sparsify_all_columns(self, sparse): def test_get_dummies_duplicate_columns(self, df): # GH20839 df.columns = ["A", "A", "A"] - result = get_dummies(df).sort_index(axis=1) + result = get_dummies(df, dtype=np.uint8).sort_index(axis=1) expected = DataFrame( [[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]], @@ -627,7 +646,7 @@ def test_get_dummies_duplicate_columns(self, df): def test_get_dummies_all_sparse(self): df = DataFrame({"A": [1, 2]}) - result = get_dummies(df, columns=["A"], sparse=True) + result = get_dummies(df, dtype=np.uint8, columns=["A"], sparse=True) dtype = SparseDtype("uint8", 0) expected = DataFrame( { @@ -652,4 +671,4 @@ def test_get_dummies_with_string_values(self, values): msg = "Input must be a list-like for parameter `columns`" with pytest.raises(TypeError, match=msg): - get_dummies(df, columns=values) + get_dummies(df, dtype=np.uint8, columns=values) From efa678b8a9b1357ff4473d9fa1996100b4b99515 Mon Sep 17 00:00:00 2001 From: Kian Eliasi Date: Wed, 10 Aug 2022 18:33:40 +0430 Subject: [PATCH 02/11] Edit get_dummies' dtype warning --- pandas/core/reshape/encoding.py | 4 ++-- pandas/tests/reshape/test_get_dummies.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index bbea1baefcf0f..aa2dc6f6e6a3d 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -231,8 +231,8 @@ def _get_dummies_1d( if dtype is None: warnings.warn( - "The default dtype will change from 'uint8' to 'bool', " - "please specify a dtype to silence this warning", + "In a future version of pandas the default dtype will change from " + "'uint8' to 'bool', please specify a dtype to silence this warning", FutureWarning, stacklevel=find_stack_level(), ) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 063421f82a05a..71c8ac42bd3f2 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -46,6 +46,7 @@ def test_get_dummies_raises_on_dtype_object(self, df): get_dummies(df, dtype="object") def test_get_dummies_warns_default_dtype(self, df): + # https://github.com/pandas-dev/pandas/issues/45848 msg = "The default dtype will change from 'uint8' to 'bool'" with tm.assert_produces_warning(FutureWarning, match=msg): get_dummies(df) From 472fa284273c583a07dfc8b585e831599fd4e32b Mon Sep 17 00:00:00 2001 From: Kian Eliasi Date: Wed, 10 Aug 2022 18:46:18 +0430 Subject: [PATCH 03/11] Add whatsnew entry for issue #45848 --- doc/source/whatsnew/v1.5.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index a0d33cb513722..91bc932054f5d 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -843,6 +843,7 @@ Other Deprecations - Deprecated setting a categorical's categories with ``cat.categories = ['a', 'b', 'c']``, use :meth:`Categorical.rename_categories` instead (:issue:`37643`) - Deprecated unused arguments ``encoding`` and ``verbose`` in :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` (:issue:`47912`) - Deprecated producing a single element when iterating over a :class:`DataFrameGroupBy` or a :class:`SeriesGroupBy` that has been grouped by a list of length 1; A tuple of length one will be returned instead (:issue:`42795`) +- Emit ``FutureWarning`` from :func:`get_dummies` when ``dtype`` is unspecified, indicating that its default value will be changed to ``bool`` (:issue:`45848`) .. --------------------------------------------------------------------------- .. _whatsnew_150.performance: From 2ead75002a86d50b5651f0d370a53372c9691424 Mon Sep 17 00:00:00 2001 From: Kian Eliasi Date: Wed, 10 Aug 2022 20:02:24 +0430 Subject: [PATCH 04/11] Fix dtype warning test --- pandas/tests/reshape/test_get_dummies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 71c8ac42bd3f2..50f9b4f26a4d2 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -47,7 +47,7 @@ def test_get_dummies_raises_on_dtype_object(self, df): def test_get_dummies_warns_default_dtype(self, df): # https://github.com/pandas-dev/pandas/issues/45848 - msg = "The default dtype will change from 'uint8' to 'bool'" + msg = "In a future version of pandas the default dtype will change" with tm.assert_produces_warning(FutureWarning, match=msg): get_dummies(df) From ddcc7d3bc4dccf0b488b2d154fe89dac1747af6f Mon Sep 17 00:00:00 2001 From: Kian Eliasi Date: Wed, 10 Aug 2022 20:03:04 +0430 Subject: [PATCH 05/11] Suppress warnings in docs --- doc/source/user_guide/reshaping.rst | 8 ++++++++ doc/source/whatsnew/v0.13.0.rst | 1 + doc/source/whatsnew/v0.15.0.rst | 1 + doc/source/whatsnew/v0.19.0.rst | 1 + doc/source/whatsnew/v0.23.0.rst | 1 + doc/source/whatsnew/v0.24.0.rst | 1 + 6 files changed, 13 insertions(+) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index adca9de6c130a..d0f305168311a 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -608,6 +608,7 @@ values, can derive a :class:`DataFrame` containing ``k`` columns of 1s and 0s us :func:`~pandas.get_dummies`: .. ipython:: python + :okwarning: df = pd.DataFrame({"key": list("bbacab"), "data1": range(6)}) @@ -617,6 +618,7 @@ Sometimes it's useful to prefix the column names, for example when merging the r with the original :class:`DataFrame`: .. ipython:: python + :okwarning: dummies = pd.get_dummies(df["key"], prefix="key") dummies @@ -626,6 +628,7 @@ with the original :class:`DataFrame`: This function is often used along with discretization functions like :func:`~pandas.cut`: .. ipython:: python + :okwarning: values = np.random.randn(10) values @@ -642,6 +645,7 @@ variables (categorical in the statistical sense, those with ``object`` or .. ipython:: python + :okwarning: df = pd.DataFrame({"A": ["a", "b", "a"], "B": ["c", "c", "b"], "C": [1, 2, 3]}) pd.get_dummies(df) @@ -650,6 +654,7 @@ All non-object columns are included untouched in the output. You can control the columns that are encoded with the ``columns`` keyword. .. ipython:: python + :okwarning: pd.get_dummies(df, columns=["A"]) @@ -667,6 +672,7 @@ the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways: * dict: Mapping column name to prefix. .. ipython:: python + :okwarning: simple = pd.get_dummies(df, prefix="new_prefix") simple @@ -680,6 +686,7 @@ variable to avoid collinearity when feeding the result to statistical models. You can switch to this mode by turn on ``drop_first``. .. ipython:: python + :okwarning: s = pd.Series(list("abcaa")) @@ -690,6 +697,7 @@ You can switch to this mode by turn on ``drop_first``. When a column contains only one level, it will be omitted in the result. .. ipython:: python + :okwarning: df = pd.DataFrame({"A": list("aaaaa"), "B": list("ababc")}) diff --git a/doc/source/whatsnew/v0.13.0.rst b/doc/source/whatsnew/v0.13.0.rst index 8265ad58f7ea3..f5dc576b4420d 100644 --- a/doc/source/whatsnew/v0.13.0.rst +++ b/doc/source/whatsnew/v0.13.0.rst @@ -501,6 +501,7 @@ Enhancements - ``NaN`` handing in get_dummies (:issue:`4446`) with ``dummy_na`` .. ipython:: python + :okwarning: # previously, nan was erroneously counted as 2 here # now it is not counted at all diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index 04506f1655c7d..2df21038eff46 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -1007,6 +1007,7 @@ Other: left untouched. .. ipython:: python + :okwarning: df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['c', 'c', 'b'], 'C': [1, 2, 3]}) diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index f2fdd23af1297..145c87b71b3d1 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -431,6 +431,7 @@ The ``pd.get_dummies`` function now returns dummy-encoded columns as small integ **New behavior**: .. ipython:: python + :okwarning: pd.get_dummies(["a", "b", "a", "c"]).dtypes diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index 9f24bc8e8ec50..4a715a5b61a04 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -366,6 +366,7 @@ Function ``get_dummies`` now supports ``dtype`` argument The :func:`get_dummies` now accepts a ``dtype`` argument, which specifies a dtype for the new columns. The default remains uint8. (:issue:`18330`) .. ipython:: python + :okwarning: df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}) pd.get_dummies(df, columns=['c']).dtypes diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index f5175283cce4e..3651235bea649 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -833,6 +833,7 @@ then all the columns are dummy-encoded, and a :class:`SparseDataFrame` was retur Now, the return type is consistently a :class:`DataFrame`. .. ipython:: python + :okwarning: type(pd.get_dummies(df, sparse=True)) type(pd.get_dummies(df[['B', 'C']], sparse=True)) From 81dbb875e5f0533d7c56eb4ce410976cc52bdc3a Mon Sep 17 00:00:00 2001 From: Kian Eliasi Date: Wed, 10 Aug 2022 22:00:34 +0430 Subject: [PATCH 06/11] Edit whatsnew entry Co-authored-by: Marco Edward Gorelli --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 91bc932054f5d..77248684ab634 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -843,7 +843,7 @@ Other Deprecations - Deprecated setting a categorical's categories with ``cat.categories = ['a', 'b', 'c']``, use :meth:`Categorical.rename_categories` instead (:issue:`37643`) - Deprecated unused arguments ``encoding`` and ``verbose`` in :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` (:issue:`47912`) - Deprecated producing a single element when iterating over a :class:`DataFrameGroupBy` or a :class:`SeriesGroupBy` that has been grouped by a list of length 1; A tuple of length one will be returned instead (:issue:`42795`) -- Emit ``FutureWarning`` from :func:`get_dummies` when ``dtype`` is unspecified, indicating that its default value will be changed to ``bool`` (:issue:`45848`) +- Deprecated ``np.uint8`` as the default ``dtype`` for :func:`get_dummies` - in a future version, it will be changed to ``bool`` (:issue:`45848`) .. --------------------------------------------------------------------------- .. _whatsnew_150.performance: From f97df66c282e8b8fe707f9f7a2ced1e4a3580fc1 Mon Sep 17 00:00:00 2001 From: Kian Eliasi Date: Tue, 23 Aug 2022 14:05:35 +0430 Subject: [PATCH 07/11] Fix find_stack_level in get_dummies dtype warning --- pandas/core/reshape/encoding.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index aa2dc6f6e6a3d..e95d8b70a0bea 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections import defaultdict +import inspect import itertools from typing import Hashable import warnings @@ -234,7 +235,7 @@ def _get_dummies_1d( "In a future version of pandas the default dtype will change from " "'uint8' to 'bool', please specify a dtype to silence this warning", FutureWarning, - stacklevel=find_stack_level(), + stacklevel=find_stack_level(inspect.currentframe()), ) dtype = np.dtype(np.uint8) # error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str, From 15aeb3e018af148c7abfcce9d6331bc16c13e8c4 Mon Sep 17 00:00:00 2001 From: Kian Eliasi Date: Thu, 22 Sep 2022 13:01:38 +0200 Subject: [PATCH 08/11] Change the default dtype of get_dummies to bool --- doc/source/user_guide/reshaping.rst | 8 - doc/source/whatsnew/v0.13.0.rst | 1 - doc/source/whatsnew/v0.15.0.rst | 1 - doc/source/whatsnew/v0.19.0.rst | 1 - doc/source/whatsnew/v0.23.0.rst | 1 - doc/source/whatsnew/v0.24.0.rst | 1 - doc/source/whatsnew/v1.5.0.rst | 1 - doc/source/whatsnew/v1.5.1.rst | 2 +- pandas/core/reshape/encoding.py | 72 ++++--- pandas/tests/frame/indexing/test_getitem.py | 4 +- .../tests/frame/methods/test_sort_values.py | 2 +- pandas/tests/reshape/test_get_dummies.py | 178 ++++++++---------- 12 files changed, 116 insertions(+), 156 deletions(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index d0f305168311a..adca9de6c130a 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -608,7 +608,6 @@ values, can derive a :class:`DataFrame` containing ``k`` columns of 1s and 0s us :func:`~pandas.get_dummies`: .. ipython:: python - :okwarning: df = pd.DataFrame({"key": list("bbacab"), "data1": range(6)}) @@ -618,7 +617,6 @@ Sometimes it's useful to prefix the column names, for example when merging the r with the original :class:`DataFrame`: .. ipython:: python - :okwarning: dummies = pd.get_dummies(df["key"], prefix="key") dummies @@ -628,7 +626,6 @@ with the original :class:`DataFrame`: This function is often used along with discretization functions like :func:`~pandas.cut`: .. ipython:: python - :okwarning: values = np.random.randn(10) values @@ -645,7 +642,6 @@ variables (categorical in the statistical sense, those with ``object`` or .. ipython:: python - :okwarning: df = pd.DataFrame({"A": ["a", "b", "a"], "B": ["c", "c", "b"], "C": [1, 2, 3]}) pd.get_dummies(df) @@ -654,7 +650,6 @@ All non-object columns are included untouched in the output. You can control the columns that are encoded with the ``columns`` keyword. .. ipython:: python - :okwarning: pd.get_dummies(df, columns=["A"]) @@ -672,7 +667,6 @@ the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways: * dict: Mapping column name to prefix. .. ipython:: python - :okwarning: simple = pd.get_dummies(df, prefix="new_prefix") simple @@ -686,7 +680,6 @@ variable to avoid collinearity when feeding the result to statistical models. You can switch to this mode by turn on ``drop_first``. .. ipython:: python - :okwarning: s = pd.Series(list("abcaa")) @@ -697,7 +690,6 @@ You can switch to this mode by turn on ``drop_first``. When a column contains only one level, it will be omitted in the result. .. ipython:: python - :okwarning: df = pd.DataFrame({"A": list("aaaaa"), "B": list("ababc")}) diff --git a/doc/source/whatsnew/v0.13.0.rst b/doc/source/whatsnew/v0.13.0.rst index f5dc576b4420d..8265ad58f7ea3 100644 --- a/doc/source/whatsnew/v0.13.0.rst +++ b/doc/source/whatsnew/v0.13.0.rst @@ -501,7 +501,6 @@ Enhancements - ``NaN`` handing in get_dummies (:issue:`4446`) with ``dummy_na`` .. ipython:: python - :okwarning: # previously, nan was erroneously counted as 2 here # now it is not counted at all diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index 2df21038eff46..04506f1655c7d 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -1007,7 +1007,6 @@ Other: left untouched. .. ipython:: python - :okwarning: df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['c', 'c', 'b'], 'C': [1, 2, 3]}) diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index 145c87b71b3d1..f2fdd23af1297 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -431,7 +431,6 @@ The ``pd.get_dummies`` function now returns dummy-encoded columns as small integ **New behavior**: .. ipython:: python - :okwarning: pd.get_dummies(["a", "b", "a", "c"]).dtypes diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index 4a715a5b61a04..9f24bc8e8ec50 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -366,7 +366,6 @@ Function ``get_dummies`` now supports ``dtype`` argument The :func:`get_dummies` now accepts a ``dtype`` argument, which specifies a dtype for the new columns. The default remains uint8. (:issue:`18330`) .. ipython:: python - :okwarning: df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}) pd.get_dummies(df, columns=['c']).dtypes diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 3651235bea649..f5175283cce4e 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -833,7 +833,6 @@ then all the columns are dummy-encoded, and a :class:`SparseDataFrame` was retur Now, the return type is consistently a :class:`DataFrame`. .. ipython:: python - :okwarning: type(pd.get_dummies(df, sparse=True)) type(pd.get_dummies(df[['B', 'C']], sparse=True)) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index c35297c1bcca7..7f968694693f9 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -932,7 +932,6 @@ Other Deprecations - Deprecated unused arguments ``encoding`` and ``verbose`` in :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` (:issue:`47912`) - Deprecated the ``inplace`` keyword in :meth:`DataFrame.set_axis` and :meth:`Series.set_axis`, use ``obj = obj.set_axis(..., copy=False)`` instead (:issue:`48130`) - Deprecated producing a single element when iterating over a :class:`DataFrameGroupBy` or a :class:`SeriesGroupBy` that has been grouped by a list of length 1; A tuple of length one will be returned instead (:issue:`42795`) -- Deprecated ``np.uint8`` as the default ``dtype`` for :func:`get_dummies` - in a future version, it will be changed to ``bool`` (:issue:`45848`) - Fixed up warning message of deprecation of :meth:`MultiIndex.lesort_depth` as public method, as the message previously referred to :meth:`MultiIndex.is_lexsorted` instead (:issue:`38701`) - Deprecated the ``sort_columns`` argument in :meth:`DataFrame.plot` and :meth:`Series.plot` (:issue:`47563`). - Deprecated positional arguments for all but the first argument of :meth:`DataFrame.to_stata` and :func:`read_stata`, use keyword arguments instead (:issue:`48128`). diff --git a/doc/source/whatsnew/v1.5.1.rst b/doc/source/whatsnew/v1.5.1.rst index 9d40d9118db32..e5e12bea47689 100644 --- a/doc/source/whatsnew/v1.5.1.rst +++ b/doc/source/whatsnew/v1.5.1.rst @@ -23,7 +23,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Bug in :func:`get_dummies` with default ``dtype`` being ``uint8`` - the default ``dtype`` is now changed to ``bool`` (:issue:`45848`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 19b1a2ed96fea..ce449675c8db8 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -1,16 +1,13 @@ from __future__ import annotations from collections import defaultdict -import inspect import itertools from typing import Hashable -import warnings import numpy as np from pandas._libs.sparse import IntIndex from pandas._typing import Dtype -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_integer_dtype, @@ -66,7 +63,7 @@ def get_dummies( drop_first : bool, default False Whether to get k-1 dummies out of k categorical levels by removing the first level. - dtype : dtype, default np.uint8 + dtype : dtype, default bool Data type for new columns. Only a single dtype is allowed. Returns @@ -89,50 +86,50 @@ def get_dummies( >>> s = pd.Series(list('abca')) >>> pd.get_dummies(s) - a b c - 0 1 0 0 - 1 0 1 0 - 2 0 0 1 - 3 1 0 0 + a b c + 0 True False False + 1 False True False + 2 False False True + 3 True False False >>> s1 = ['a', 'b', np.nan] >>> pd.get_dummies(s1) - a b - 0 1 0 - 1 0 1 - 2 0 0 + a b + 0 True False + 1 False True + 2 False False >>> pd.get_dummies(s1, dummy_na=True) - a b NaN - 0 1 0 0 - 1 0 1 0 - 2 0 0 1 + a b NaN + 0 True False False + 1 False True False + 2 False False True >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], ... 'C': [1, 2, 3]}) >>> pd.get_dummies(df, prefix=['col1', 'col2']) C col1_a col1_b col2_a col2_b col2_c - 0 1 1 0 0 1 0 - 1 2 0 1 1 0 0 - 2 3 1 0 0 0 1 + 0 1 True False False True False + 1 2 False True True False False + 2 3 True False False False True >>> pd.get_dummies(pd.Series(list('abcaa'))) - a b c - 0 1 0 0 - 1 0 1 0 - 2 0 0 1 - 3 1 0 0 - 4 1 0 0 + a b c + 0 True False False + 1 False True False + 2 False False True + 3 True False False + 4 True False False >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) - b c - 0 0 0 - 1 1 0 - 2 0 1 - 3 0 0 - 4 0 0 + b c + 0 False False + 1 True False + 2 False True + 3 False False + 4 False False >>> pd.get_dummies(pd.Series(list('abc')), dtype=float) a b c @@ -236,16 +233,11 @@ def _get_dummies_1d( codes, levels = factorize_from_iterable(Series(data)) if dtype is None: - warnings.warn( - "In a future version of pandas the default dtype will change from " - "'uint8' to 'bool', please specify a dtype to silence this warning", - FutureWarning, - stacklevel=find_stack_level(inspect.currentframe()), - ) - dtype = np.dtype(np.uint8) + dtype = bool # error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str, # dtype[Any], Type[object]]"; expected "Type[Any]" - dtype = np.dtype(dtype) # type: ignore[arg-type] + else: + dtype = np.dtype(dtype) # type: ignore[arg-type] if is_object_dtype(dtype): raise ValueError("dtype=object is not a valid dtype for get_dummies") diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py index f5c85bd98d8ad..0c1b206cc39bb 100644 --- a/pandas/tests/frame/indexing/test_getitem.py +++ b/pandas/tests/frame/indexing/test_getitem.py @@ -52,9 +52,7 @@ def test_getitem_list_of_labels_categoricalindex_cols(self): # GH#16115 cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")]) - expected = DataFrame( - [[1, 0], [0, 1]], dtype="uint8", index=[0, 1], columns=cats - ) + expected = DataFrame([[1, 0], [0, 1]], dtype="bool", index=[0, 1], columns=cats) dummies = get_dummies(cats) result = dummies[list(dummies.columns)] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index 9f3fcb1db546d..26d684bb2fa3a 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -19,7 +19,7 @@ def test_sort_values_sparse_no_warning(self): # GH#45618 # TODO(2.0): test will be unnecessary ser = pd.Series(Categorical(["a", "b", "a"], categories=["a", "b", "c"])) - df = pd.get_dummies(ser, sparse=True) + df = pd.get_dummies(ser, dtype=np.uint8, sparse=True) with tm.assert_produces_warning(None): # No warnings about constructing Index from SparseArray diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 50f9b4f26a4d2..4345a357a0ba8 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -45,12 +45,6 @@ def test_get_dummies_raises_on_dtype_object(self, df): with pytest.raises(ValueError, match=msg): get_dummies(df, dtype="object") - def test_get_dummies_warns_default_dtype(self, df): - # https://github.com/pandas-dev/pandas/issues/45848 - msg = "In a future version of pandas the default dtype will change" - with tm.assert_produces_warning(FutureWarning, match=msg): - get_dummies(df) - def test_get_dummies_basic(self, sparse, dtype): s_list = list("abc") s_series = Series(s_list) @@ -127,11 +121,9 @@ def test_get_dummies_just_na(self, sparse): just_na_series = Series(just_na_list) just_na_series_index = Series(just_na_list, index=["A"]) - res_list = get_dummies(just_na_list, dtype=np.uint8, sparse=sparse) - res_series = get_dummies(just_na_series, dtype=np.uint8, sparse=sparse) - res_series_index = get_dummies( - just_na_series_index, dtype=np.uint8, sparse=sparse - ) + res_list = get_dummies(just_na_list, sparse=sparse) + res_series = get_dummies(just_na_series, sparse=sparse) + res_series_index = get_dummies(just_na_series_index, sparse=sparse) assert res_list.empty assert res_series.empty @@ -177,9 +169,9 @@ def test_get_dummies_unicode(self, sparse): e = "e" eacute = unicodedata.lookup("LATIN SMALL LETTER E WITH ACUTE") s = [e, eacute, eacute] - res = get_dummies(s, dtype=np.uint8, prefix="letter", sparse=sparse) + res = get_dummies(s, prefix="letter", sparse=sparse) exp = DataFrame( - {"letter_e": [1, 0, 0], f"letter_{eacute}": [0, 1, 1]}, dtype=np.uint8 + {"letter_e": [True, False, False], f"letter_{eacute}": [False, True, True]} ) if sparse: exp = exp.apply(SparseArray, fill_value=0) @@ -187,18 +179,18 @@ def test_get_dummies_unicode(self, sparse): def test_dataframe_dummies_all_obj(self, df, sparse): df = df[["A", "B"]] - result = get_dummies(df, dtype=np.uint8, sparse=sparse) + result = get_dummies(df, sparse=sparse) expected = DataFrame( {"A_a": [1, 0, 1], "A_b": [0, 1, 0], "B_b": [1, 1, 0], "B_c": [0, 0, 1]}, - dtype=np.uint8, + dtype=bool, ) if sparse: expected = DataFrame( { - "A_a": SparseArray([1, 0, 1], dtype="uint8"), - "A_b": SparseArray([0, 1, 0], dtype="uint8"), - "B_b": SparseArray([1, 1, 0], dtype="uint8"), - "B_c": SparseArray([0, 0, 1], dtype="uint8"), + "A_a": SparseArray([1, 0, 1], dtype="bool"), + "A_b": SparseArray([0, 1, 0], dtype="bool"), + "B_b": SparseArray([1, 1, 0], dtype="bool"), + "B_c": SparseArray([0, 0, 1], dtype="bool"), } ) @@ -208,7 +200,7 @@ def test_dataframe_dummies_string_dtype(self, df): # GH44965 df = df[["A", "B"]] df = df.astype({"A": "object", "B": "string"}) - result = get_dummies(df, dtype=np.uint8) + result = get_dummies(df) expected = DataFrame( { "A_a": [1, 0, 1], @@ -216,7 +208,7 @@ def test_dataframe_dummies_string_dtype(self, df): "B_b": [1, 1, 0], "B_c": [0, 0, 1], }, - dtype=np.uint8, + dtype=bool, ) tm.assert_frame_equal(result, expected) @@ -242,16 +234,15 @@ def test_dataframe_dummies_mix_default(self, df, sparse, dtype): def test_dataframe_dummies_prefix_list(self, df, sparse): prefixes = ["from_A", "from_B"] - result = get_dummies(df, dtype=np.uint8, prefix=prefixes, sparse=sparse) + result = get_dummies(df, prefix=prefixes, sparse=sparse) expected = DataFrame( { "C": [1, 2, 3], - "from_A_a": [1, 0, 1], - "from_A_b": [0, 1, 0], - "from_B_b": [1, 1, 0], - "from_B_c": [0, 0, 1], + "from_A_a": [True, False, True], + "from_A_b": [False, True, False], + "from_B_b": [True, True, False], + "from_B_c": [False, False, True], }, - dtype=np.uint8, ) expected[["C"]] = df[["C"]] cols = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"] @@ -263,12 +254,15 @@ def test_dataframe_dummies_prefix_list(self, df, sparse): def test_dataframe_dummies_prefix_str(self, df, sparse): # not that you should do this... - result = get_dummies(df, dtype=np.uint8, prefix="bad", sparse=sparse) + result = get_dummies(df, prefix="bad", sparse=sparse) bad_columns = ["bad_a", "bad_b", "bad_b", "bad_c"] expected = DataFrame( - [[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]], + [ + [1, True, False, True, False], + [2, False, True, True, False], + [3, True, False, False, True], + ], columns=["C"] + bad_columns, - dtype=np.uint8, ) expected = expected.astype({"C": np.int64}) if sparse: @@ -277,10 +271,10 @@ def test_dataframe_dummies_prefix_str(self, df, sparse): expected = pd.concat( [ Series([1, 2, 3], name="C"), - Series([1, 0, 1], name="bad_a", dtype="Sparse[uint8]"), - Series([0, 1, 0], name="bad_b", dtype="Sparse[uint8]"), - Series([1, 1, 0], name="bad_b", dtype="Sparse[uint8]"), - Series([0, 0, 1], name="bad_c", dtype="Sparse[uint8]"), + Series([True, False, True], name="bad_a", dtype="Sparse[bool]"), + Series([False, True, False], name="bad_b", dtype="Sparse[bool]"), + Series([True, True, False], name="bad_b", dtype="Sparse[bool]"), + Series([False, False, True], name="bad_c", dtype="Sparse[bool]"), ], axis=1, ) @@ -288,9 +282,7 @@ def test_dataframe_dummies_prefix_str(self, df, sparse): tm.assert_frame_equal(result, expected) def test_dataframe_dummies_subset(self, df, sparse): - result = get_dummies( - df, dtype=np.uint8, prefix=["from_A"], columns=["A"], sparse=sparse - ) + result = get_dummies(df, prefix=["from_A"], columns=["A"], sparse=sparse) expected = DataFrame( { "B": ["b", "b", "c"], @@ -300,40 +292,37 @@ def test_dataframe_dummies_subset(self, df, sparse): }, ) cols = expected.columns - expected[cols[1:]] = expected[cols[1:]].astype(np.uint8) + expected[cols[1:]] = expected[cols[1:]].astype(bool) expected[["C"]] = df[["C"]] if sparse: cols = ["from_A_a", "from_A_b"] - expected[cols] = expected[cols].astype(SparseDtype("uint8", 0)) + expected[cols] = expected[cols].astype(SparseDtype("bool", 0)) tm.assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_sep(self, df, sparse): - result = get_dummies(df, dtype=np.uint8, prefix_sep="..", sparse=sparse) + result = get_dummies(df, prefix_sep="..", sparse=sparse) expected = DataFrame( { "C": [1, 2, 3], - "A..a": [1, 0, 1], - "A..b": [0, 1, 0], - "B..b": [1, 1, 0], - "B..c": [0, 0, 1], + "A..a": [True, False, True], + "A..b": [False, True, False], + "B..b": [True, True, False], + "B..c": [False, False, True], }, - dtype=np.uint8, ) expected[["C"]] = df[["C"]] expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]] if sparse: cols = ["A..a", "A..b", "B..b", "B..c"] - expected[cols] = expected[cols].astype(SparseDtype("uint8", 0)) + expected[cols] = expected[cols].astype(SparseDtype("bool", 0)) tm.assert_frame_equal(result, expected) - result = get_dummies(df, dtype=np.uint8, prefix_sep=["..", "__"], sparse=sparse) + result = get_dummies(df, prefix_sep=["..", "__"], sparse=sparse) expected = expected.rename(columns={"B..b": "B__b", "B..c": "B__c"}) tm.assert_frame_equal(result, expected) - result = get_dummies( - df, dtype=np.uint8, prefix_sep={"A": "..", "B": "__"}, sparse=sparse - ) + result = get_dummies(df, prefix_sep={"A": "..", "B": "__"}, sparse=sparse) tm.assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_bad_length(self, df, sparse): @@ -342,7 +331,7 @@ def test_dataframe_dummies_prefix_bad_length(self, df, sparse): "encoded (2)" ) with pytest.raises(ValueError, match=msg): - get_dummies(df, dtype=np.uint8, prefix=["too few"], sparse=sparse) + get_dummies(df, prefix=["too few"], sparse=sparse) def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse): msg = re.escape( @@ -350,12 +339,12 @@ def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse): "encoded (2)" ) with pytest.raises(ValueError, match=msg): - get_dummies(df, dtype=np.uint8, prefix_sep=["bad"], sparse=sparse) + get_dummies(df, prefix_sep=["bad"], sparse=sparse) def test_dataframe_dummies_prefix_dict(self, sparse): prefixes = {"A": "from_A", "B": "from_B"} df = DataFrame({"C": [1, 2, 3], "A": ["a", "b", "a"], "B": ["b", "b", "c"]}) - result = get_dummies(df, dtype=np.uint8, prefix=prefixes, sparse=sparse) + result = get_dummies(df, prefix=prefixes, sparse=sparse) expected = DataFrame( { @@ -368,9 +357,9 @@ def test_dataframe_dummies_prefix_dict(self, sparse): ) columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"] - expected[columns] = expected[columns].astype(np.uint8) + expected[columns] = expected[columns].astype(bool) if sparse: - expected[columns] = expected[columns].astype(SparseDtype("uint8", 0)) + expected[columns] = expected[columns].astype(SparseDtype("bool", 0)) tm.assert_frame_equal(result, expected) @@ -434,19 +423,19 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype): [ ( {"data": DataFrame({"ä": ["a"]})}, - DataFrame({"ä_a": [1]}, dtype=np.uint8), + DataFrame({"ä_a": [True]}), ), ( {"data": DataFrame({"x": ["ä"]})}, - DataFrame({"x_ä": [1]}, dtype=np.uint8), + DataFrame({"x_ä": [True]}), ), ( {"data": DataFrame({"x": ["a"]}), "prefix": "ä"}, - DataFrame({"ä_a": [1]}, dtype=np.uint8), + DataFrame({"ä_a": [True]}), ), ( {"data": DataFrame({"x": ["a"]}), "prefix_sep": "ä"}, - DataFrame({"xäa": [1]}, dtype=np.uint8), + DataFrame({"xäa": [True]}), ), ], ) @@ -463,20 +452,18 @@ def test_get_dummies_basic_drop_first(self, sparse): s_series = Series(s_list) s_series_index = Series(s_list, list("ABC")) - expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=np.uint8) + expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=bool) - result = get_dummies(s_list, dtype=np.uint8, drop_first=True, sparse=sparse) + result = get_dummies(s_list, drop_first=True, sparse=sparse) if sparse: expected = expected.apply(SparseArray, fill_value=0) tm.assert_frame_equal(result, expected) - result = get_dummies(s_series, dtype=np.uint8, drop_first=True, sparse=sparse) + result = get_dummies(s_series, drop_first=True, sparse=sparse) tm.assert_frame_equal(result, expected) expected.index = list("ABC") - result = get_dummies( - s_series_index, dtype=np.uint8, drop_first=True, sparse=sparse - ) + result = get_dummies(s_series_index, drop_first=True, sparse=sparse) tm.assert_frame_equal(result, expected) def test_get_dummies_basic_drop_first_one_level(self, sparse): @@ -487,32 +474,28 @@ def test_get_dummies_basic_drop_first_one_level(self, sparse): expected = DataFrame(index=np.arange(3)) - result = get_dummies(s_list, dtype=np.uint8, drop_first=True, sparse=sparse) + result = get_dummies(s_list, drop_first=True, sparse=sparse) tm.assert_frame_equal(result, expected) - result = get_dummies(s_series, dtype=np.uint8, drop_first=True, sparse=sparse) + result = get_dummies(s_series, drop_first=True, sparse=sparse) tm.assert_frame_equal(result, expected) expected = DataFrame(index=list("ABC")) - result = get_dummies( - s_series_index, dtype=np.uint8, drop_first=True, sparse=sparse - ) + result = get_dummies(s_series_index, drop_first=True, sparse=sparse) tm.assert_frame_equal(result, expected) def test_get_dummies_basic_drop_first_NA(self, sparse): # Test NA handling together with drop_first s_NA = ["a", "b", np.nan] - res = get_dummies(s_NA, dtype=np.uint8, drop_first=True, sparse=sparse) - exp = DataFrame({"b": [0, 1, 0]}, dtype=np.uint8) + res = get_dummies(s_NA, drop_first=True, sparse=sparse) + exp = DataFrame({"b": [0, 1, 0]}, dtype=bool) if sparse: exp = exp.apply(SparseArray, fill_value=0) tm.assert_frame_equal(res, exp) - res_na = get_dummies( - s_NA, dtype=np.uint8, dummy_na=True, drop_first=True, sparse=sparse - ) - exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=np.uint8).reindex( + res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse) + exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=bool).reindex( ["b", np.nan], axis=1 ) if sparse: @@ -520,27 +503,27 @@ def test_get_dummies_basic_drop_first_NA(self, sparse): tm.assert_frame_equal(res_na, exp_na) res_just_na = get_dummies( - [np.nan], dtype=np.uint8, dummy_na=True, drop_first=True, sparse=sparse + [np.nan], dummy_na=True, drop_first=True, sparse=sparse ) exp_just_na = DataFrame(index=np.arange(1)) tm.assert_frame_equal(res_just_na, exp_just_na) def test_dataframe_dummies_drop_first(self, df, sparse): df = df[["A", "B"]] - result = get_dummies(df, dtype=np.uint8, drop_first=True, sparse=sparse) - expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=np.uint8) + result = get_dummies(df, drop_first=True, sparse=sparse) + expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=bool) if sparse: expected = expected.apply(SparseArray, fill_value=0) tm.assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype): df["cat"] = Categorical(["x", "y", "y"]) - result = get_dummies(df, dtype=np.uint8, drop_first=True, sparse=sparse) + result = get_dummies(df, drop_first=True, sparse=sparse) expected = DataFrame( {"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]} ) cols = ["A_b", "B_c", "cat_y"] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(bool) expected = expected[["C", "A_b", "B_c", "cat_y"]] if sparse: for col in cols: @@ -550,7 +533,7 @@ def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype): def test_dataframe_dummies_drop_first_with_na(self, df, sparse): df.loc[3, :] = [np.nan, np.nan, np.nan] result = get_dummies( - df, dtype=np.uint8, dummy_na=True, drop_first=True, sparse=sparse + df, dummy_na=True, drop_first=True, sparse=sparse ).sort_index(axis=1) expected = DataFrame( { @@ -562,7 +545,7 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse): } ) cols = ["A_b", "A_nan", "B_c", "B_nan"] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(bool) expected = expected.sort_index(axis=1) if sparse: for col in cols: @@ -570,22 +553,20 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse): tm.assert_frame_equal(result, expected) - result = get_dummies( - df, dtype=np.uint8, dummy_na=False, drop_first=True, sparse=sparse - ) + result = get_dummies(df, dummy_na=False, drop_first=True, sparse=sparse) expected = expected[["C", "A_b", "B_c"]] tm.assert_frame_equal(result, expected) def test_get_dummies_int_int(self): data = Series([1, 2, 1]) - result = get_dummies(data, dtype=np.uint8) - expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=np.uint8) + result = get_dummies(data) + expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=bool) tm.assert_frame_equal(result, expected) data = Series(Categorical(["a", "b", "a"])) - result = get_dummies(data, dtype=np.uint8) + result = get_dummies(data) expected = DataFrame( - [[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=np.uint8 + [[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=bool ) tm.assert_frame_equal(result, expected) @@ -625,7 +606,7 @@ def test_dataframe_dummies_preserve_categorical_dtype(self, dtype, ordered): def test_get_dummies_dont_sparsify_all_columns(self, sparse): # GH18914 df = DataFrame.from_dict({"GDP": [1, 2], "Nation": ["AB", "CD"]}) - df = get_dummies(df, dtype=np.uint8, columns=["Nation"], sparse=sparse) + df = get_dummies(df, columns=["Nation"], sparse=sparse) df2 = df.reindex(columns=["GDP"]) tm.assert_frame_equal(df[["GDP"]], df2) @@ -633,12 +614,15 @@ def test_get_dummies_dont_sparsify_all_columns(self, sparse): def test_get_dummies_duplicate_columns(self, df): # GH20839 df.columns = ["A", "A", "A"] - result = get_dummies(df, dtype=np.uint8).sort_index(axis=1) + result = get_dummies(df).sort_index(axis=1) expected = DataFrame( - [[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]], + [ + [1, True, False, True, False], + [2, False, True, True, False], + [3, True, False, False, True], + ], columns=["A", "A_a", "A_b", "A_b", "A_c"], - dtype=np.uint8, ).sort_index(axis=1) expected = expected.astype({"A": np.int64}) @@ -647,8 +631,8 @@ def test_get_dummies_duplicate_columns(self, df): def test_get_dummies_all_sparse(self): df = DataFrame({"A": [1, 2]}) - result = get_dummies(df, dtype=np.uint8, columns=["A"], sparse=True) - dtype = SparseDtype("uint8", 0) + result = get_dummies(df, columns=["A"], sparse=True) + dtype = SparseDtype("bool", 0) expected = DataFrame( { "A_1": SparseArray([1, 0], dtype=dtype), @@ -672,4 +656,4 @@ def test_get_dummies_with_string_values(self, values): msg = "Input must be a list-like for parameter `columns`" with pytest.raises(TypeError, match=msg): - get_dummies(df, dtype=np.uint8, columns=values) + get_dummies(df, columns=values) From a246b8c2edb6c1d2f7edef41961989592b4f4600 Mon Sep 17 00:00:00 2001 From: Kian Eliasi Date: Sun, 25 Sep 2022 16:32:10 +0200 Subject: [PATCH 09/11] Revert dtype(bool) change --- pandas/core/reshape/encoding.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index a89ac050ce18e..a39e3c1f10956 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -236,11 +236,10 @@ def _get_dummies_1d( codes, levels = factorize_from_iterable(Series(data)) if dtype is None: - dtype = bool + dtype = np.dtype(bool) # error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str, # dtype[Any], Type[object]]"; expected "Type[Any]" - else: - dtype = np.dtype(dtype) # type: ignore[arg-type] + dtype = np.dtype(dtype) # type: ignore[arg-type] if is_object_dtype(dtype): raise ValueError("dtype=object is not a valid dtype for get_dummies") From 7cef2fcde8c01fb4d6010a2f9c9e8ccd3320bd27 Mon Sep 17 00:00:00 2001 From: Kian Eliasi Date: Fri, 7 Oct 2022 12:45:38 +0200 Subject: [PATCH 10/11] Move the changelog entry to v1.6.0.rst --- doc/source/whatsnew/v1.5.1.rst | 1 - doc/source/whatsnew/v1.6.0.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.1.rst b/doc/source/whatsnew/v1.5.1.rst index 99db171a624dc..4d7576c013fd6 100644 --- a/doc/source/whatsnew/v1.5.1.rst +++ b/doc/source/whatsnew/v1.5.1.rst @@ -92,7 +92,6 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Bug in :func:`get_dummies` with default ``dtype`` being ``uint8`` - the default ``dtype`` is now changed to ``bool`` (:issue:`45848`) - Bug in :meth:`Series.__getitem__` not falling back to positional for integer keys and boolean :class:`Index` (:issue:`48653`) - Bug in :meth:`DataFrame.to_hdf` raising ``AssertionError`` with boolean index (:issue:`48667`) - Bug in :func:`assert_index_equal` for extension arrays with non matching ``NA`` raising ``ValueError`` (:issue:`48608`) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index 0bc91d3cd9637..6b7fa004eadf5 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -256,6 +256,7 @@ Reshaping ^^^^^^^^^ - Bug in :meth:`DataFrame.pivot_table` raising ``TypeError`` for nullable dtype and ``margins=True`` (:issue:`48681`) - Bug in :meth:`DataFrame.pivot` not respecting ``None`` as column name (:issue:`48293`) +- Bug in :func:`get_dummies` with default ``dtype`` being ``uint8`` - the default ``dtype`` is now changed to ``bool`` (:issue:`45848`) - Bug in :func:`join` when ``left_on`` or ``right_on`` is or includes a :class:`CategoricalIndex` incorrectly raising ``AttributeError`` (:issue:`48464`) - From 8a93cc948cc4b03f4bcc9b9dfb3d8392faeedb91 Mon Sep 17 00:00:00 2001 From: Kian Eliasi Date: Tue, 11 Oct 2022 10:58:15 +0200 Subject: [PATCH 11/11] Move whatsnew entry to 'Other API changes' --- doc/source/whatsnew/v1.6.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index eb0a3807c0c2f..0cad6f3caaf91 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -118,6 +118,7 @@ Other API changes ^^^^^^^^^^^^^^^^^ - Passing ``nanoseconds`` greater than 999 or less than 0 in :class:`Timestamp` now raises a ``ValueError`` (:issue:`48538`, :issue:`48255`) - :func:`read_csv`: specifying an incorrect number of columns with ``index_col`` of now raises ``ParserError`` instead of ``IndexError`` when using the c parser. +- Default value of ``dtype`` in :func:`get_dummies` is changed to ``bool`` from ``uint8`` (:issue:`45848`) - :meth:`DataFrame.astype`, :meth:`Series.astype`, and :meth:`DatetimeIndex.astype` casting datetime64 data to any of "datetime64[s]", "datetime64[ms]", "datetime64[us]" will return an object with the given resolution instead of coercing back to "datetime64[ns]" (:issue:`48928`) - :meth:`DataFrame.astype`, :meth:`Series.astype`, and :meth:`DatetimeIndex.astype` casting timedelta64 data to any of "timedelta64[s]", "timedelta64[ms]", "timedelta64[us]" will return an object with the given resolution instead of coercing to "float64" dtype (:issue:`48963`) - @@ -260,7 +261,6 @@ Reshaping ^^^^^^^^^ - Bug in :meth:`DataFrame.pivot_table` raising ``TypeError`` for nullable dtype and ``margins=True`` (:issue:`48681`) - Bug in :meth:`DataFrame.pivot` not respecting ``None`` as column name (:issue:`48293`) -- Bug in :func:`get_dummies` with default ``dtype`` being ``uint8`` - the default ``dtype`` is now changed to ``bool`` (:issue:`45848`) - Bug in :func:`join` when ``left_on`` or ``right_on`` is or includes a :class:`CategoricalIndex` incorrectly raising ``AttributeError`` (:issue:`48464`) -