From c232efd2f2159b908965281e670eeb59750d26c7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Dec 2023 13:53:45 -0800 Subject: [PATCH 1/4] BUG: rolling with datetime ArrowDtype --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/arrays/datetimelike.py | 7 +++++- pandas/core/dtypes/common.py | 3 ++- pandas/core/window/rolling.py | 22 +++++++++++-------- pandas/tests/window/test_timeseries_window.py | 16 ++++++++++++++ 5 files changed, 38 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 67b4052b386c0..4200bf9074a48 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -645,7 +645,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) -- +- Bug in :meth:`DataFrame.rolling` and :meth:`Series.rolling` where either the ``index`` or ``on`` column was :class:`ArrowDtype` with ``pyarrow.timestamp`` type (:issue:`55849`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a88f40013b3f6..dc04bb5460e89 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -91,6 +91,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import ( + ArrowDtype, CategoricalDtype, DatetimeTZDtype, ExtensionDtype, @@ -2497,7 +2498,7 @@ def _validate_inferred_freq( return freq -def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype) -> str: +def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype | ArrowDtype) -> str: """ Return the unit str corresponding to the dtype's resolution. @@ -2512,4 +2513,8 @@ def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype) -> str: """ if isinstance(dtype, DatetimeTZDtype): return dtype.unit + elif isinstance(dtype, ArrowDtype): + if dtype.kind not in "mM": + raise ValueError(f"{dtype=} does not have a resolution.") + return dtype.pyarrow_dtype.unit return np.datetime_data(dtype)[0] diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 3d12e334e7c0f..89688d80b70fc 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -23,6 +23,7 @@ from pandas.core.dtypes.base import _registry as registry from pandas.core.dtypes.dtypes import ( + ArrowDtype, CategoricalDtype, DatetimeTZDtype, ExtensionDtype, @@ -1058,7 +1059,7 @@ def needs_i8_conversion(dtype: DtypeObj | None) -> bool: >>> needs_i8_conversion(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern").dtype) True """ - if isinstance(dtype, np.dtype): + if isinstance(dtype, (np.dtype, ArrowDtype)): return dtype.kind in "mM" return isinstance(dtype, (PeriodDtype, DatetimeTZDtype)) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index f90863a8ea1ef..53bd4c4d70243 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -14,7 +14,6 @@ Any, Callable, Literal, - cast, ) import numpy as np @@ -104,6 +103,7 @@ NDFrameT, QuantileInterpolation, WindowingRankType, + npt, ) from pandas import ( @@ -404,11 +404,13 @@ def _insert_on_column(self, result: DataFrame, obj: DataFrame) -> None: result[name] = extra_col @property - def _index_array(self): + def _index_array(self) -> npt.NDArray[np.int64] | None: # TODO: why do we get here with e.g. MultiIndex? if needs_i8_conversion(self._on.dtype): - idx = cast("PeriodIndex | DatetimeIndex | TimedeltaIndex", self._on) - return idx.asi8 + if isinstance(self._on, (PeriodIndex, DatetimeIndex, TimedeltaIndex)): + return self._on.asi8 + else: + return self._on.to_numpy(dtype=np.int64) return None def _resolve_output(self, out: DataFrame, obj: DataFrame) -> DataFrame: @@ -439,7 +441,7 @@ def _apply_series( self, homogeneous_func: Callable[..., ArrayLike], name: str | None = None ) -> Series: """ - Series version of _apply_blockwise + Series version of _apply_columnwise """ obj = self._create_data(self._selected_obj) @@ -455,7 +457,7 @@ def _apply_series( index = self._slice_axis_for_step(obj.index, result) return obj._constructor(result, index=index, name=obj.name) - def _apply_blockwise( + def _apply_columnwise( self, homogeneous_func: Callable[..., ArrayLike], name: str, @@ -614,7 +616,7 @@ def calc(x): return result if self.method == "single": - return self._apply_blockwise(homogeneous_func, name, numeric_only) + return self._apply_columnwise(homogeneous_func, name, numeric_only) else: return self._apply_tablewise(homogeneous_func, name, numeric_only) @@ -1236,7 +1238,9 @@ def calc(x): return result - return self._apply_blockwise(homogeneous_func, name, numeric_only)[:: self.step] + return self._apply_columnwise(homogeneous_func, name, numeric_only)[ + :: self.step + ] @doc( _shared_docs["aggregate"], @@ -1871,7 +1875,7 @@ def _validate(self): # we allow rolling on a datetimelike index if ( self.obj.empty - or isinstance(self._on, (DatetimeIndex, TimedeltaIndex, PeriodIndex)) + or (isinstance(self._on, PeriodIndex) or self._on.dtype.kind in "Mm") ) and isinstance(self.window, (str, BaseOffset, timedelta)): self._validate_datetimelike_monotonic() diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index c99fc8a8eb60f..bd0fadeb3e475 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -1,9 +1,12 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, DatetimeIndex, + Index, MultiIndex, NaT, Series, @@ -697,3 +700,16 @@ def test_nat_axis_error(msg, axis): with pytest.raises(ValueError, match=f"{msg} values must not have NaT"): with tm.assert_produces_warning(FutureWarning, match=warn_msg): df.rolling("D", axis=axis).mean() + + +@td.skip_if_no("pyarrow") +def test_arrow_datetime_axis(): + # GH 55849 + expected = Series( + np.arange(5, dtype=np.float64), + index=Index( + date_range("2020-01-01", periods=5), dtype="timestamp[ns][pyarrow]" + ), + ) + result = expected.rolling("1D").sum() + tm.assert_series_equal(result, expected) From 15e0869d9e239a85027bf5cdcd05c2f79eefbf3c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Dec 2023 15:23:43 -0800 Subject: [PATCH 2/4] Dont modify needs_i8_conversion --- pandas/core/dtypes/common.py | 3 +-- pandas/core/window/rolling.py | 10 +++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 89688d80b70fc..3d12e334e7c0f 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -23,7 +23,6 @@ from pandas.core.dtypes.base import _registry as registry from pandas.core.dtypes.dtypes import ( - ArrowDtype, CategoricalDtype, DatetimeTZDtype, ExtensionDtype, @@ -1059,7 +1058,7 @@ def needs_i8_conversion(dtype: DtypeObj | None) -> bool: >>> needs_i8_conversion(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern").dtype) True """ - if isinstance(dtype, (np.dtype, ArrowDtype)): + if isinstance(dtype, np.dtype): return dtype.kind in "mM" return isinstance(dtype, (PeriodDtype, DatetimeTZDtype)) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 53bd4c4d70243..b06cb8dd0ddd8 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -38,6 +38,7 @@ is_numeric_dtype, needs_i8_conversion, ) +from pandas.core.dtypes.dtypes import ArrowDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -406,11 +407,10 @@ def _insert_on_column(self, result: DataFrame, obj: DataFrame) -> None: @property def _index_array(self) -> npt.NDArray[np.int64] | None: # TODO: why do we get here with e.g. MultiIndex? - if needs_i8_conversion(self._on.dtype): - if isinstance(self._on, (PeriodIndex, DatetimeIndex, TimedeltaIndex)): - return self._on.asi8 - else: - return self._on.to_numpy(dtype=np.int64) + if isinstance(self._on, (PeriodIndex, DatetimeIndex, TimedeltaIndex)): + return self._on.asi8 + elif isinstance(self._on.dtype, ArrowDtype): + return self._on.to_numpy(dtype=np.int64) return None def _resolve_output(self, out: DataFrame, obj: DataFrame) -> DataFrame: From c844b18d3cc4d3e2499f58bfcf9791b0dff71b49 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 14 Dec 2023 09:27:51 -0800 Subject: [PATCH 3/4] More explicit tests --- pandas/core/window/rolling.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index c1c3994d4b4ec..1d83d785be1ef 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -409,7 +409,7 @@ def _index_array(self) -> npt.NDArray[np.int64] | None: # TODO: why do we get here with e.g. MultiIndex? if isinstance(self._on, (PeriodIndex, DatetimeIndex, TimedeltaIndex)): return self._on.asi8 - elif isinstance(self._on.dtype, ArrowDtype): + elif isinstance(self._on.dtype, ArrowDtype) and self._on.dtype.kind in "mM": return self._on.to_numpy(dtype=np.int64) return None @@ -1875,7 +1875,8 @@ def _validate(self): # we allow rolling on a datetimelike index if ( self.obj.empty - or (isinstance(self._on, PeriodIndex) or self._on.dtype.kind in "Mm") + or isinstance(self._on, (DatetimeIndex, TimedeltaIndex, PeriodIndex)) + or (isinstance(self._on.dtype, ArrowDtype) and self._on.dtype.kind in "mM") ) and isinstance(self.window, (str, BaseOffset, timedelta)): self._validate_datetimelike_monotonic() From 45d7a7c9bb5f4ae6409c21bb603c4ade847a821e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 18 Dec 2023 14:14:41 -0800 Subject: [PATCH 4/4] Fix arrow to_numpy --- pandas/core/arrays/arrow/array.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 84d6e2fb7ca53..509b6a04893a0 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1302,10 +1302,13 @@ def to_numpy( if dtype is not None: dtype = np.dtype(dtype) - if na_value is lib.no_default: + pa_type = self._pa_array.type + + if na_value is lib.no_default and not ( + pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type) + ): na_value = self.dtype.na_value - pa_type = self._pa_array.type if not self._hasna or isna(na_value) or pa.types.is_null(pa_type): data = self else: @@ -1313,16 +1316,9 @@ def to_numpy( copy = False if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type): - result = data._maybe_convert_datelike_array() - if (pa.types.is_timestamp(pa_type) and pa_type.tz is not None) or ( - dtype is not None and dtype.kind == "O" - ): - dtype = object - else: - # GH 55997 - dtype = None - na_value = pa_type.to_pandas_dtype().type("nat", pa_type.unit) - result = result.to_numpy(dtype=dtype, na_value=na_value) + result = data._maybe_convert_datelike_array().to_numpy( + dtype=dtype, na_value=na_value + ) elif pa.types.is_time(pa_type) or pa.types.is_date(pa_type): # convert to list of python datetime.time objects before # wrapping in ndarray