From 0a4a6e71efc1b1c3060c47adac33da3a46cdc0a9 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 5 Feb 2022 09:35:07 -0800 Subject: [PATCH 1/5] BUG: DataFrame[dt64].where downcasting --- pandas/core/arrays/boolean.py | 2 +- pandas/core/dtypes/cast.py | 11 +++- pandas/core/internals/blocks.py | 43 +++++++++++---- .../tests/arrays/boolean/test_construction.py | 5 +- pandas/tests/dtypes/cast/test_downcast.py | 17 +++++- pandas/tests/frame/indexing/test_where.py | 55 +++++++++++++++++++ 6 files changed, 116 insertions(+), 17 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 6a69d4d610336..7af74772978fe 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -214,7 +214,7 @@ def coerce_to_array( raise TypeError("Need to pass bool-like values") if mask is None and mask_values is None: - mask = np.zeros(len(values), dtype=bool) + mask = np.zeros(values.shape, dtype=bool) elif mask is None: mask = mask_values else: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index b1d7de0515998..468468a0e986a 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -260,9 +260,9 @@ def maybe_downcast_to_dtype(result: ArrayLike, dtype: str | np.dtype) -> ArrayLi dtype = "bool" elif inferred_type == "integer": dtype = "int64" - elif inferred_type == "datetime64": + elif inferred_type in ["datetime", "datetime64"]: dtype = "datetime64[ns]" - elif inferred_type == "timedelta64": + elif inferred_type in ["timedelta", "timedelta64"]: dtype = "timedelta64[ns]" # try to upcast here @@ -290,6 +290,13 @@ def maybe_downcast_to_dtype(result: ArrayLike, dtype: str | np.dtype) -> ArrayLi if dtype.kind in ["M", "m"] and result.dtype.kind in ["i", "f"]: result = result.astype(dtype) + elif dtype.kind == "m" and result.dtype == _dtype_obj: + # test_where_downcast_to_td64 + result = array_to_timedelta64(result) + + elif dtype == "M8[ns]" and result.dtype == _dtype_obj: + return np.asarray(maybe_cast_to_datetime(result, dtype=dtype)) + return result diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 4c284fdc39822..70dc28348a9ec 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1399,21 +1399,40 @@ def where(self, other, cond) -> list[Block]: except (ValueError, TypeError) as err: _catch_deprecated_value_error(err) - if is_interval_dtype(self.dtype): - # TestSetitemFloatIntervalWithIntIntervalValues - blk = self.coerce_to_target_dtype(orig_other) - nbs = blk.where(orig_other, orig_cond) - return self._maybe_downcast(nbs, "infer") + if self.ndim == 1 or self.shape[0] == 1: - elif isinstance(self, NDArrayBackedExtensionBlock): - # NB: not (yet) the same as - # isinstance(values, NDArrayBackedExtensionArray) - blk = self.coerce_to_target_dtype(orig_other) - nbs = blk.where(orig_other, orig_cond) - return self._maybe_downcast(nbs, "infer") + if is_interval_dtype(self.dtype): + # TestSetitemFloatIntervalWithIntIntervalValues + blk = self.coerce_to_target_dtype(orig_other) + nbs = blk.where(orig_other, orig_cond) + return self._maybe_downcast(nbs, "infer") + + elif isinstance(self, NDArrayBackedExtensionBlock): + # NB: not (yet) the same as + # isinstance(values, NDArrayBackedExtensionArray) + blk = self.coerce_to_target_dtype(orig_other) + nbs = blk.where(orig_other, orig_cond) + return self._maybe_downcast(nbs, "infer") + + else: + raise else: - raise + # Same pattern we use in Block.putmask + is_array = isinstance(orig_other, (np.ndarray, ExtensionArray)) + + res_blocks = [] + nbs = self._split() + for i, nb in enumerate(nbs): + n = orig_other + if is_array: + # we have a different value per-column + n = orig_other[:, i : i + 1] + + submask = orig_cond[:, i : i + 1] + rbs = nb.where(n, submask) + res_blocks.extend(rbs) + return res_blocks nb = self.make_block_same_class(res_values) return [nb] diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py index 15f92f2567c1c..64b1786cbd101 100644 --- a/pandas/tests/arrays/boolean/test_construction.py +++ b/pandas/tests/arrays/boolean/test_construction.py @@ -183,8 +183,11 @@ def test_coerce_to_array(): values = np.array([True, False, True, False], dtype="bool") mask = np.array([False, False, False, True], dtype="bool") + # passing 2D values is OK as long as no mask + coerce_to_array(values.reshape(1, -1)) + with pytest.raises(ValueError, match="values.shape and mask.shape must match"): - coerce_to_array(values.reshape(1, -1)) + coerce_to_array(values.reshape(1, -1), mask=mask) with pytest.raises(ValueError, match="values.shape and mask.shape must match"): coerce_to_array(values, mask=mask.reshape(1, -1)) diff --git a/pandas/tests/dtypes/cast/test_downcast.py b/pandas/tests/dtypes/cast/test_downcast.py index b1efc1e5677fd..283a5dac9e2e5 100644 --- a/pandas/tests/dtypes/cast/test_downcast.py +++ b/pandas/tests/dtypes/cast/test_downcast.py @@ -5,7 +5,11 @@ from pandas.core.dtypes.cast import maybe_downcast_to_dtype -from pandas import Series +from pandas import ( + Series, + Timedelta, + Timestamp, +) import pandas._testing as tm @@ -34,6 +38,17 @@ "int64", np.array([decimal.Decimal(0.0)]), ), + ( + np.array([Timedelta(days=1), Timedelta(days=2)], dtype=object), + "infer", + np.array([1, 2], dtype="m8[D]").astype("m8[ns]"), + ), + ( + np.array([Timestamp("2016-01-01")] * 2, dtype=object), + "infer", + np.array(["2016-01-01"] * 2, dtype="M8[ns]"), + ), + # TODO: similar for dt64tz, Period, Interval? ], ) def test_downcast(arr, expected, dtype): diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index d2fa187106e1b..76e2cb607231a 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -969,3 +969,58 @@ def test_where_inplace_casting(data): df_copy = df.where(pd.notnull(df), None).copy() df.where(pd.notnull(df), None, inplace=True) tm.assert_equal(df, df_copy) + + +def test_where_downcast_to_td64(): + ser = Series([1, 2, 3]) + + mask = np.array([False, False, False]) + + td = pd.Timedelta(days=1) + + res = ser.where(mask, td) + expected = Series([td, td, td], dtype="m8[ns]") + tm.assert_series_equal(res, expected) + + +def _check_where_equivalences(df, mask, other, expected): + # similar to tests.series.indexing.test_setitem.SetitemCastingEquivalences + # but with DataFrame in mind and less fleshed-out + res = df.where(mask, other) + tm.assert_frame_equal(res, expected) + + res = df.mask(~mask, other) + tm.assert_frame_equal(res, expected) + + # Note: we cannot do the same with frame.mask(~mask, other, inplace=True) + # bc that goes through Block.putmask which does *not* downcast. + + +def test_where_dt64_2d(): + dti = date_range("2016-01-01", periods=6) + dta = dti._data.reshape(3, 2) + other = dta - dta[0, 0] + + df = DataFrame(dta, columns=["A", "B"]) + + mask = np.asarray(df.isna()) + mask[:, 1] = True + + # setting all of one column, none of the other + expected = DataFrame({"A": other[:, 0], "B": dta[:, 1]}) + _check_where_equivalences(df, mask, other, expected) + + # setting part of one column, none of the other + mask[1, 0] = True + expected = DataFrame( + { + "A": np.array([other[0, 0], dta[1, 0], other[2, 0]], dtype=object), + "B": dta[:, 1], + } + ) + _check_where_equivalences(df, mask, other, expected) + + # setting nothing in either column + mask[:] = True + expected = df + _check_where_equivalences(df, mask, other, expected) From 93f2ba47c57ccf109cf8a679315f365b64b45280 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 5 Feb 2022 09:38:00 -0800 Subject: [PATCH 2/5] whatsnew --- doc/source/whatsnew/v1.5.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 571bcb7a6d2b2..fd2265c318208 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -268,6 +268,7 @@ Indexing - Bug in setting a NA value (``None`` or ``np.nan``) into a :class:`Series` with int-based :class:`IntervalDtype` incorrectly casting to object dtype instead of a float-based :class:`IntervalDtype` (:issue:`45568`) - Bug in :meth:`Series.__setitem__` with a non-integer :class:`Index` when using an integer key to set a value that cannot be set inplace where a ``ValueError`` was raised insead of casting to a common dtype (:issue:`45070`) - Bug in :meth:`Series.__setitem__` when setting incompatible values into a ``PeriodDtype`` or ``IntervalDtype`` :class:`Series` raising when indexing with a boolean mask but coercing when indexing with otherwise-equivalent indexers; these now consistently coerce, along with :meth:`Series.mask` and :meth:`Series.where` (:issue:`45768`) +- Bug in :meth:`DataFrame.where` with multiple columns with datetime-like dtypes failing to downcast results consistent with other dtypes (:issue:`45837`) - Bug in :meth:`Series.loc.__setitem__` and :meth:`Series.loc.__getitem__` not raising when using multiple keys without using a :class:`MultiIndex` (:issue:`13831`) - Bug when setting a value too large for a :class:`Series` dtype failing to coerce to a common type (:issue:`26049`, :issue:`32878`) - Bug in :meth:`loc.__setitem__` treating ``range`` keys as positional instead of label-based (:issue:`45479`) @@ -283,7 +284,7 @@ Indexing Missing ^^^^^^^ - Bug in :meth:`Series.fillna` and :meth:`DataFrame.fillna` with ``downcast`` keyword not being respected in some cases where there are no NA values present (:issue:`45423`) -- Bug in :meth:`Series.fillna` and :meth:`DataFrame.fillna` with :class:`IntervalDtype` and incompatible value raising instead of casting to a common (usually object) dtype (:issue:`??`) +- Bug in :meth:`Series.fillna` and :meth:`DataFrame.fillna` with :class:`IntervalDtype` and incompatible value raising instead of casting to a common (usually object) dtype (:issue:`45796`) - Bug in :meth:`DataFrame.interpolate` with object-dtype column not returning a copy with ``inplace=False`` (:issue:`45791`) - From e749f4ee3b1125549aa6f5fef78f2bd81001847c Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 5 Feb 2022 09:38:25 -0800 Subject: [PATCH 3/5] GH ref --- pandas/tests/dtypes/cast/test_downcast.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/dtypes/cast/test_downcast.py b/pandas/tests/dtypes/cast/test_downcast.py index 283a5dac9e2e5..b234ffd5d87b3 100644 --- a/pandas/tests/dtypes/cast/test_downcast.py +++ b/pandas/tests/dtypes/cast/test_downcast.py @@ -39,11 +39,13 @@ np.array([decimal.Decimal(0.0)]), ), ( + # GH#45837 np.array([Timedelta(days=1), Timedelta(days=2)], dtype=object), "infer", np.array([1, 2], dtype="m8[D]").astype("m8[ns]"), ), ( + # GH#45837 np.array([Timestamp("2016-01-01")] * 2, dtype=object), "infer", np.array(["2016-01-01"] * 2, dtype="M8[ns]"), From 3aa3bfcf83bee0866b1e5da5c6564b54a83635c5 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 5 Feb 2022 16:21:12 -0800 Subject: [PATCH 4/5] punt on dt64 --- pandas/core/dtypes/cast.py | 2 +- pandas/tests/dtypes/cast/test_downcast.py | 9 +-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 468468a0e986a..e1f54cbc8c466 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -260,7 +260,7 @@ def maybe_downcast_to_dtype(result: ArrayLike, dtype: str | np.dtype) -> ArrayLi dtype = "bool" elif inferred_type == "integer": dtype = "int64" - elif inferred_type in ["datetime", "datetime64"]: + elif inferred_type == "datetime64": dtype = "datetime64[ns]" elif inferred_type in ["timedelta", "timedelta64"]: dtype = "timedelta64[ns]" diff --git a/pandas/tests/dtypes/cast/test_downcast.py b/pandas/tests/dtypes/cast/test_downcast.py index b234ffd5d87b3..c01eac746455c 100644 --- a/pandas/tests/dtypes/cast/test_downcast.py +++ b/pandas/tests/dtypes/cast/test_downcast.py @@ -8,7 +8,6 @@ from pandas import ( Series, Timedelta, - Timestamp, ) import pandas._testing as tm @@ -44,13 +43,7 @@ "infer", np.array([1, 2], dtype="m8[D]").astype("m8[ns]"), ), - ( - # GH#45837 - np.array([Timestamp("2016-01-01")] * 2, dtype=object), - "infer", - np.array(["2016-01-01"] * 2, dtype="M8[ns]"), - ), - # TODO: similar for dt64tz, Period, Interval? + # TODO: similar for dt64, dt64tz, Period, Interval? ], ) def test_downcast(arr, expected, dtype): From 07bf8a05c96fa1a80af600557d30cbf2bd750771 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 9 Feb 2022 08:26:23 -0800 Subject: [PATCH 5/5] mypy fixup --- pandas/core/dtypes/cast.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index bc95d00251264..835c0a90c309d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -292,6 +292,7 @@ def maybe_downcast_to_dtype(result: ArrayLike, dtype: str | np.dtype) -> ArrayLi elif dtype.kind == "m" and result.dtype == _dtype_obj: # test_where_downcast_to_td64 + result = cast(np.ndarray, result) result = array_to_timedelta64(result) elif dtype == "M8[ns]" and result.dtype == _dtype_obj: