From a8498ee83ab3964730b047ede393fba34d3bcdc4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 24 Mar 2023 14:42:57 -0400 Subject: [PATCH 1/3] BUG: to_numeric converting StringArray to object or float64 --- pandas/core/tools/numeric.py | 20 ++++++++++++++---- pandas/tests/tools/test_to_numeric.py | 30 +++++++++++++++++++++------ 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 97900eacd1f5d..c44708f9a807c 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -19,17 +19,19 @@ is_integer_dtype, is_number, is_numeric_dtype, - is_object_dtype, is_scalar, + is_string_dtype, needs_i8_conversion, ) from pandas.core.dtypes.generic import ( ABCIndex, ABCSeries, ) +from pandas.core.dtypes.missing import isna from pandas.core.arrays import BaseMaskedArray from pandas.core.arrays.arrow import ArrowDtype +from pandas.core.arrays.string_ import StringDtype if TYPE_CHECKING: from pandas._typing import ( @@ -196,6 +198,8 @@ def to_numeric( else: values = arg + orig_values = values + # GH33013: for IntegerArray & FloatingArray extract non-null values for casting # save mask to reconstruct the full array after casting mask: npt.NDArray[np.bool_] | None = None @@ -225,6 +229,7 @@ def to_numeric( except (ValueError, TypeError): if errors == "raise": raise + values = orig_values if new_mask is not None: # Remove unnecessary values, is expected later anyway and enables @@ -264,9 +269,16 @@ def to_numeric( break # GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct - # masked array - if (mask is not None or new_mask is not None) and not is_object_dtype(values.dtype): - if mask is None: + # masked array. For StringArray need to compute a mask if conversion was successful. + if ( + mask is not None + or new_mask is not None + or isinstance(values_dtype, StringDtype) + ) and not is_string_dtype(values.dtype): + if mask is None and isinstance(values_dtype, StringDtype): + mask = isna(values) + values = values[~mask] + elif mask is None: mask = new_mask else: mask = mask.copy() diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 4a0b01a275523..b8b78d59f71d1 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -723,12 +723,12 @@ def test_precision_float_conversion(strrep): @pytest.mark.parametrize( "values, expected", [ - (["1", "2", None], Series([1, 2, np.nan])), - (["1", "2", "3"], Series([1, 2, 3])), - (["1", "2", 3], Series([1, 2, 3])), - (["1", "2", 3.5], Series([1, 2, 3.5])), - (["1", None, 3.5], Series([1, np.nan, 3.5])), - (["1", "2", "3.5"], Series([1, 2, 3.5])), + (["1", "2", None], Series([1, 2, np.nan], dtype="Float64")), + (["1", "2", "3"], Series([1, 2, 3], dtype="Int64")), + (["1", "2", 3], Series([1, 2, 3], dtype="Int64")), + (["1", "2", 3.5], Series([1, 2, 3.5], dtype="Float64")), + (["1", None, 3.5], Series([1, np.nan, 3.5], dtype="Float64")), + (["1", "2", "3.5"], Series([1, 2, 3.5], dtype="Float64")), ], ) def test_to_numeric_from_nullable_string(values, nullable_string_dtype, expected): @@ -738,6 +738,24 @@ def test_to_numeric_from_nullable_string(values, nullable_string_dtype, expected tm.assert_series_equal(result, expected) +def test_to_numeric_from_nullable_string_coerce(nullable_string_dtype): + # GH#52146 + values = ["a", "1"] + ser = Series(values, dtype=nullable_string_dtype) + result = to_numeric(ser, errors="coerce") + expected = Series([pd.NA, 1], dtype="Float64") + tm.assert_series_equal(result, expected) + + +def test_to_numeric_from_nullable_string_ignore(nullable_string_dtype): + # GH#52146 + values = ["a", "1"] + ser = Series(values, dtype=nullable_string_dtype) + expected = ser.copy() + result = to_numeric(ser, errors="ignore") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "data, input_dtype, downcast, expected_dtype", ( From 3704c3d67baa8a94a113673fa9c81dcd447975e2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 24 Mar 2023 14:55:29 -0400 Subject: [PATCH 2/3] BUG: to_numeric converting StringArray to object or float64 --- pandas/_libs/lib.pyx | 8 ++++++-- pandas/core/tools/numeric.py | 21 +++++++++------------ pandas/tests/tools/test_to_numeric.py | 4 ++-- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 88ea61a23a426..c3bb33df34e56 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2325,10 +2325,14 @@ def maybe_convert_numeric( if not seen.coerce_numeric: raise type(err)(f"{err} at position {i}") - seen.saw_null() - floats[i] = NaN mask[i] = 1 + if allow_null_in_int: + seen.null_ = True + else: + seen.saw_null() + floats[i] = NaN + if seen.check_uint64_conflict(): return (values, None) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index c44708f9a807c..8a0e322135b99 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -27,7 +27,6 @@ ABCIndex, ABCSeries, ) -from pandas.core.dtypes.missing import isna from pandas.core.arrays import BaseMaskedArray from pandas.core.arrays.arrow import ArrowDtype @@ -224,7 +223,8 @@ def to_numeric( values, set(), coerce_numeric=coerce_numeric, - convert_to_masked_nullable=dtype_backend is not lib.no_default, + convert_to_masked_nullable=dtype_backend is not lib.no_default + or isinstance(values_dtype, StringDtype), ) except (ValueError, TypeError): if errors == "raise": @@ -235,7 +235,11 @@ def to_numeric( # Remove unnecessary values, is expected later anyway and enables # downcasting values = values[~new_mask] - elif dtype_backend is not lib.no_default and new_mask is None: + elif ( + dtype_backend is not lib.no_default + and new_mask is None + or isinstance(values_dtype, StringDtype) + ): new_mask = np.zeros(values.shape, dtype=np.bool_) # attempt downcast only if the data has been successfully converted @@ -270,15 +274,8 @@ def to_numeric( # GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct # masked array. For StringArray need to compute a mask if conversion was successful. - if ( - mask is not None - or new_mask is not None - or isinstance(values_dtype, StringDtype) - ) and not is_string_dtype(values.dtype): - if mask is None and isinstance(values_dtype, StringDtype): - mask = isna(values) - values = values[~mask] - elif mask is None: + if (mask is not None or new_mask is not None) and not is_string_dtype(values.dtype): + if mask is None: mask = new_mask else: mask = mask.copy() diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index b8b78d59f71d1..fe6794b120681 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -723,7 +723,7 @@ def test_precision_float_conversion(strrep): @pytest.mark.parametrize( "values, expected", [ - (["1", "2", None], Series([1, 2, np.nan], dtype="Float64")), + (["1", "2", None], Series([1, 2, np.nan], dtype="Int64")), (["1", "2", "3"], Series([1, 2, 3], dtype="Int64")), (["1", "2", 3], Series([1, 2, 3], dtype="Int64")), (["1", "2", 3.5], Series([1, 2, 3.5], dtype="Float64")), @@ -743,7 +743,7 @@ def test_to_numeric_from_nullable_string_coerce(nullable_string_dtype): values = ["a", "1"] ser = Series(values, dtype=nullable_string_dtype) result = to_numeric(ser, errors="coerce") - expected = Series([pd.NA, 1], dtype="Float64") + expected = Series([pd.NA, 1], dtype="Int64") tm.assert_series_equal(result, expected) From 333e30b942cdc1d5609f54fece2e19c02c10de34 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 24 Mar 2023 14:58:20 -0400 Subject: [PATCH 3/3] Update comment --- pandas/core/tools/numeric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 8a0e322135b99..04443f89ddf6f 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -273,7 +273,7 @@ def to_numeric( break # GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct - # masked array. For StringArray need to compute a mask if conversion was successful. + # masked array if (mask is not None or new_mask is not None) and not is_string_dtype(values.dtype): if mask is None: mask = new_mask