From 7a226638b937b39ecdf497eb8d04ee07bf06c1a3 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 11 Jan 2025 07:32:52 -0500 Subject: [PATCH 1/7] TST(string dtype): Resolve xfails in test_from_dummies --- pandas/core/reshape/encoding.py | 17 +++++++++-- pandas/tests/reshape/test_from_dummies.py | 35 +++++++++++++++++++---- 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 33ff182f5baee..d7d6ada27ba0f 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -17,12 +17,14 @@ is_integer_dtype, is_list_like, is_object_dtype, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( ArrowDtype, CategoricalDtype, ) +from pandas.core.dtypes.missing import isna from pandas.core.arrays import SparseArray from pandas.core.arrays.categorical import factorize_from_iterable @@ -554,9 +556,20 @@ def from_dummies( "Dummy DataFrame contains multi-assignment(s); " f"First instance in row: {assigned.idxmax()}" ) + dtype = data.columns.dtype if any(assigned == 0): if isinstance(default_category, dict): - cats.append(default_category[prefix]) + value = default_category[prefix] + if ( + is_string_dtype(data.columns.dtype) + and not isinstance(value, str) + and (is_list_like(value) or not isna(value)) + ): + # GH#??? + # `value` is not a string or NA. + # Using data.columns.dtype would coerce `value` into a string. + dtype = "object" + cats.append(value) else: raise ValueError( "Dummy DataFrame contains unassigned value(s); " @@ -567,7 +580,7 @@ def from_dummies( ) else: data_slice = data_to_decode.loc[:, prefix_slice] - cats_array = data._constructor_sliced(cats, dtype=data.columns.dtype) + cats_array = data._constructor_sliced(cats, dtype=dtype) # get indices of True entries along axis=1 true_values = data_slice.idxmax(axis=1) indexer = data_slice.columns.get_indexer_for(true_values) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index da1930323f464..4fb48cd21d428 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -1,8 +1,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - +import pandas as pd from pandas import ( DataFrame, Series, @@ -336,8 +335,6 @@ def test_no_prefix_string_cats_default_category( dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) result = from_dummies(dummies, default_category=default_category) expected = DataFrame(expected) - if using_infer_string: - expected[""] = expected[""].astype("str") tm.assert_frame_equal(result, expected) @@ -364,7 +361,6 @@ def test_with_prefix_contains_get_dummies_NaN_column(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "default_category, expected", [ @@ -450,3 +446,32 @@ def test_maintain_original_index(): result = from_dummies(df) expected = DataFrame({"": list("abca")}, index=list("abcd")) tm.assert_frame_equal(result, expected) + + +def test_int_columns_with_float_default(): + # GH#??? + df = DataFrame( + { + 3: [1, 0, 0], + 4: [0, 1, 0], + }, + ) + with pytest.raises(ValueError, match="Trying to coerce float values to integers"): + from_dummies(df, default_category=0.5) + + +def test_object_dtype_preserved(): + # GH#??? + # When the input has object dtype, the result should as + # well even when infer_string is True. + df = DataFrame( + { + "x": [1, 0, 0], + "y": [0, 1, 0], + }, + ) + df.columns = df.columns.astype("object") + with pd.option_context("future.infer_string", True): + result = from_dummies(df, default_category="z") + expected = DataFrame({"": ["x", "y", "z"]}, dtype="object") + tm.assert_frame_equal(result, expected) From 40448cc9ef42358768364087f41d74d27b4d5a95 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 11 Jan 2025 12:50:48 -0500 Subject: [PATCH 2/7] Add GH references --- pandas/core/reshape/encoding.py | 2 +- pandas/tests/reshape/test_from_dummies.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index d7d6ada27ba0f..b6c7dc9d1136e 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -565,7 +565,7 @@ def from_dummies( and not isinstance(value, str) and (is_list_like(value) or not isna(value)) ): - # GH#??? + # https://github.com/pandas-dev/pandas/pull/60694 # `value` is not a string or NA. # Using data.columns.dtype would coerce `value` into a string. dtype = "object" diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 4fb48cd21d428..ef928db329b48 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -449,7 +449,7 @@ def test_maintain_original_index(): def test_int_columns_with_float_default(): - # GH#??? + # https://github.com/pandas-dev/pandas/pull/60694 df = DataFrame( { 3: [1, 0, 0], @@ -461,7 +461,7 @@ def test_int_columns_with_float_default(): def test_object_dtype_preserved(): - # GH#??? + # https://github.com/pandas-dev/pandas/pull/60694 # When the input has object dtype, the result should as # well even when infer_string is True. df = DataFrame( From 35598ee1db7f21368daf43e944dc321e25614773 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 25 Jan 2025 07:12:28 -0500 Subject: [PATCH 3/7] type-hint --- pandas/core/reshape/encoding.py | 7 +++++-- pandas/tests/io/pytables/test_complex.py | 8 +++----- pandas/tests/io/pytables/test_file_handling.py | 6 ++---- pandas/tests/io/pytables/test_timezones.py | 8 +++----- 4 files changed, 13 insertions(+), 16 deletions(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index b6c7dc9d1136e..2d77549dd0955 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -37,7 +37,10 @@ from pandas.core.series import Series if TYPE_CHECKING: - from pandas._typing import NpDtype + from pandas._typing import ( + DtypeObj, + NpDtype, + ) def get_dummies( @@ -556,7 +559,7 @@ def from_dummies( "Dummy DataFrame contains multi-assignment(s); " f"First instance in row: {assigned.idxmax()}" ) - dtype = data.columns.dtype + dtype: str | DtypeObj = data.columns.dtype if any(assigned == 0): if isinstance(default_category, dict): value = default_category[prefix] diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py index d140cfc941e16..c6eb7670f1e73 100644 --- a/pandas/tests/io/pytables/test_complex.py +++ b/pandas/tests/io/pytables/test_complex.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -13,9 +11,9 @@ from pandas.io.pytables import read_hdf -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) +# pytestmark = pytest.mark.xfail( +# using_string_dtype(), reason="TODO(infer_string)", strict=False +# ) def test_complex_fixed(tmp_path, setup_path): diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py index 16c3c6798ff76..9359a18d162c0 100644 --- a/pandas/tests/io/pytables/test_file_handling.py +++ b/pandas/tests/io/pytables/test_file_handling.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import ( PY311, is_ci_environment, @@ -329,7 +327,7 @@ def test_complibs(tmp_path, lvl, lib, request): assert node.filters.complib == lib -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) +# @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.skipif( not is_platform_little_endian(), reason="reason platform is not little endian" ) @@ -347,7 +345,7 @@ def test_encoding(setup_path): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) +# @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "val", [ diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 8f179f844e4d0..db99f88f0f7ba 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs.timezones import maybe_get_tz import pandas.util._test_decorators as td @@ -25,9 +23,9 @@ ensure_clean_store, ) -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) +# pytestmark = pytest.mark.xfail( +# using_string_dtype(), reason="TODO(infer_string)", strict=False +# ) def _compare_with_tz(a, b): From 29295627e1a28aabe925f527b531e2f03fdab2a9 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 13 Jul 2025 08:46:55 -0400 Subject: [PATCH 4/7] Revert to a doc update --- pandas/core/reshape/encoding.py | 23 ++++++----------------- pandas/tests/reshape/test_from_dummies.py | 13 +++++++------ 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 0129d82cb9e9b..8fe0e48fa9e6b 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -17,14 +17,12 @@ is_integer_dtype, is_list_like, is_object_dtype, - is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( ArrowDtype, CategoricalDtype, ) -from pandas.core.dtypes.missing import isna from pandas.core.arrays import SparseArray from pandas.core.arrays.categorical import factorize_from_iterable @@ -38,7 +36,6 @@ if TYPE_CHECKING: from pandas._typing import ( - DtypeObj, NpDtype, ) @@ -395,7 +392,9 @@ def from_dummies( The default category is the implied category when a value has none of the listed categories specified with a one, i.e. if all dummies in a row are zero. Can be a single value for all variables or a dict directly mapping - the default categories to a prefix of a variable. + the default categories to a prefix of a variable. The default category + will be coerced to the dtype of ``data.columns`` if such coercion is + lossless, and will raise otherwise. Returns ------- @@ -560,20 +559,9 @@ def from_dummies( "Dummy DataFrame contains multi-assignment(s); " f"First instance in row: {assigned.idxmax()}" ) - dtype: str | DtypeObj = data.columns.dtype if any(assigned == 0): if isinstance(default_category, dict): - value = default_category[prefix] - if ( - is_string_dtype(data.columns.dtype) - and not isinstance(value, str) - and (is_list_like(value) or not isna(value)) - ): - # https://github.com/pandas-dev/pandas/pull/60694 - # `value` is not a string or NA. - # Using data.columns.dtype would coerce `value` into a string. - dtype = "object" - cats.append(value) + cats.append(default_category[prefix]) else: raise ValueError( "Dummy DataFrame contains unassigned value(s); " @@ -584,7 +572,8 @@ def from_dummies( ) else: data_slice = data_to_decode.loc[:, prefix_slice] - cats_array = data._constructor_sliced(cats, dtype=dtype) + # cats_array = data._constructor_sliced(cats, dtype=dtype) + cats_array = data._constructor_sliced(cats, dtype=data.columns.dtype) # get indices of True entries along axis=1 true_values = data_slice.idxmax(axis=1) indexer = data_slice.columns.get_indexer_for(true_values) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index 776631ab36978..d1e65aa262d72 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -1,7 +1,6 @@ import numpy as np import pytest -import pandas as pd from pandas import ( DataFrame, Series, @@ -334,7 +333,7 @@ def test_no_prefix_string_cats_default_category( ): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) result = from_dummies(dummies, default_category=default_category) - expected = DataFrame(expected) + expected = DataFrame(expected, dtype=dummies.columns.dtype) tm.assert_frame_equal(result, expected) @@ -466,6 +465,9 @@ def test_object_dtype_preserved(): # https://github.com/pandas-dev/pandas/pull/60694 # When the input has object dtype, the result should as # well even when infer_string is True. + import pandas as pd + + assert pd.get_option("future.infer_string") df = DataFrame( { "x": [1, 0, 0], @@ -473,7 +475,6 @@ def test_object_dtype_preserved(): }, ) df.columns = df.columns.astype("object") - with pd.option_context("future.infer_string", True): - result = from_dummies(df, default_category="z") - expected = DataFrame({"": ["x", "y", "z"]}, dtype="object") - tm.assert_frame_equal(result, expected) + result = from_dummies(df, default_category="z") + expected = DataFrame({"": ["x", "y", "z"]}, dtype="object") + tm.assert_frame_equal(result, expected) From 770c2f5a7babca5b3d0cffbf29651ac900eebea1 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 13 Jul 2025 08:47:32 -0400 Subject: [PATCH 5/7] Cleanup --- pandas/core/reshape/encoding.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 8fe0e48fa9e6b..ed53bea636a8f 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -572,7 +572,6 @@ def from_dummies( ) else: data_slice = data_to_decode.loc[:, prefix_slice] - # cats_array = data._constructor_sliced(cats, dtype=dtype) cats_array = data._constructor_sliced(cats, dtype=data.columns.dtype) # get indices of True entries along axis=1 true_values = data_slice.idxmax(axis=1) From c073d81d7a1c874f4179b89474dd93d42c71dc07 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 13 Jul 2025 08:48:02 -0400 Subject: [PATCH 6/7] Cleanup --- pandas/core/reshape/encoding.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index ed53bea636a8f..67fb075110f0d 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -35,9 +35,7 @@ from pandas.core.series import Series if TYPE_CHECKING: - from pandas._typing import ( - NpDtype, - ) + from pandas._typing import NpDtype def get_dummies( From 7a6c847b1a1385a5e2e74e272aecaefdddf0421d Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 13 Jul 2025 08:50:15 -0400 Subject: [PATCH 7/7] Cleanup --- pandas/tests/reshape/test_from_dummies.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index d1e65aa262d72..dfb691c785404 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -465,9 +465,6 @@ def test_object_dtype_preserved(): # https://github.com/pandas-dev/pandas/pull/60694 # When the input has object dtype, the result should as # well even when infer_string is True. - import pandas as pd - - assert pd.get_option("future.infer_string") df = DataFrame( { "x": [1, 0, 0],