From 24b07309ef383212c3cdfb1fcf9a33c03c8e316b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 16 Nov 2021 21:03:52 -0800 Subject: [PATCH 1/2] ERR: Raise ValueError when BaseIndexer start & end bounds are unequal length --- doc/source/whatsnew/v1.4.0.rst | 4 +-- pandas/core/indexers/objects.py | 13 +++++++ pandas/core/window/rolling.py | 21 ++++-------- pandas/tests/window/test_base_indexer.py | 43 ++++++++++++++++++++++++ 4 files changed, 64 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 2456406f0eca3..7086509f855d7 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -676,8 +676,8 @@ Groupby/resample/rolling - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not calculating window bounds correctly for the first row when ``center=True`` and index is decreasing (:issue:`43927`) - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` for centered datetimelike windows with uneven nanosecond (:issue:`43997`) - Bug in :meth:`GroupBy.nth` failing on ``axis=1`` (:issue:`43926`) -- Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not respecting right bound on centered datetime-like windows, if the index contain duplicates (:issue:`#3944`) - +- Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not respecting right bound on centered datetime-like windows, if the index contain duplicates (:issue:`3944`) +- Bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` when using a :class:`pandas.api.indexers.BaseIndexer` subclass that returned unequal start and end arrays would segfault instead of raising a ``ValueError`` (:issue:`44470`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index c4156f214ca68..cc106a13599a8 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -37,6 +37,19 @@ """ +def check_window_bounds(start: np.ndarray, end: np.ndarray, num_vals: int) -> None: + if len(start) != len(end): + raise ValueError( + f"start ({len(start)}) and end ({len(end)}) bounds must be the " + f"same length" + ) + elif len(start) != num_vals: + raise ValueError( + f"start and end bounds ({len(start)}) must be the same length " + f"as the object ({num_vals})" + ) + + class BaseIndexer: """Base class for window bounds calculations.""" diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index f7799912937b7..ae0b2d5a5034a 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -63,6 +63,7 @@ FixedWindowIndexer, GroupbyIndexer, VariableWindowIndexer, + check_window_bounds, ) from pandas.core.indexes.api import ( DatetimeIndex, @@ -311,10 +312,7 @@ def __iter__(self): center=self.center, closed=self.closed, ) - - assert len(start) == len( - end - ), "these should be equal in length from get_window_bounds" + check_window_bounds(start, end, len(obj)) for s, e in zip(start, end): result = obj.iloc[slice(s, e)] @@ -565,9 +563,7 @@ def calc(x): center=self.center, closed=self.closed, ) - assert len(start) == len( - end - ), "these should be equal in length from get_window_bounds" + check_window_bounds(start, end, len(x)) return func(x, start, end, min_periods, *numba_args) @@ -608,6 +604,7 @@ def _numba_apply( center=self.center, closed=self.closed, ) + check_window_bounds(start, end, len(values)) aggregator = executor.generate_shared_aggregator( func, engine_kwargs, numba_cache_key_str ) @@ -1544,10 +1541,7 @@ def cov_func(x, y): center=self.center, closed=self.closed, ) - - assert len(start) == len( - end - ), "these should be equal in length from get_window_bounds" + check_window_bounds(start, end, len(x_array)) with np.errstate(all="ignore"): mean_x_y = window_aggregations.roll_mean( @@ -1588,10 +1582,7 @@ def corr_func(x, y): center=self.center, closed=self.closed, ) - - assert len(start) == len( - end - ), "these should be equal in length from get_window_bounds" + check_window_bounds(start, end, len(x_array)) with np.errstate(all="ignore"): mean_x_y = window_aggregations.roll_mean( diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index df4666d16ace0..a7ad409683ec8 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -452,3 +452,46 @@ def test_rolling_groupby_with_fixed_forward_many(group_keys, window_size): manual = manual.set_index(["a", "c"])["b"] tm.assert_series_equal(result, manual) + + +def test_unequal_start_end_bounds(): + class CustomIndexer(BaseIndexer): + def get_window_bounds(self, num_values, min_periods, center, closed): + return np.array([1]), np.array([1, 2]) + + indexer = CustomIndexer() + roll = Series(1).rolling(indexer) + match = "start" + with pytest.raises(ValueError, match=match): + roll.mean() + + with pytest.raises(ValueError, match=match): + next(iter(roll)) + + with pytest.raises(ValueError, match=match): + roll.corr(pairwise=True) + + with pytest.raises(ValueError, match=match): + roll.cov(pairwise=True) + + +def test_unequal_bounds_to_object(): + # GH 44470 + class CustomIndexer(BaseIndexer): + def get_window_bounds(self, num_values, min_periods, center, closed): + return np.array([1]), np.array([2]) + + indexer = CustomIndexer() + roll = Series([1, 1]).rolling(indexer) + match = "start and end" + with pytest.raises(ValueError, match=match): + roll.mean() + + with pytest.raises(ValueError, match=match): + next(iter(roll)) + + with pytest.raises(ValueError, match=match): + roll.corr(pairwise=True) + + with pytest.raises(ValueError, match=match): + roll.cov(pairwise=True) From d5a84bb2da3518d9e6914e1d1701707f88d23c3f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 18 Nov 2021 22:22:55 -0800 Subject: [PATCH 2/2] Make check private method --- pandas/core/indexers/objects.py | 13 ------------- pandas/core/window/ewm.py | 7 +++++++ pandas/core/window/rolling.py | 25 +++++++++++++++++++------ 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index cc106a13599a8..c4156f214ca68 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -37,19 +37,6 @@ """ -def check_window_bounds(start: np.ndarray, end: np.ndarray, num_vals: int) -> None: - if len(start) != len(end): - raise ValueError( - f"start ({len(start)}) and end ({len(end)}) bounds must be the " - f"same length" - ) - elif len(start) != num_vals: - raise ValueError( - f"start and end bounds ({len(start)}) must be the same length " - f"as the object ({num_vals})" - ) - - class BaseIndexer: """Base class for window bounds calculations.""" diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index f5f681d9de797..d91388e9722f7 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -417,6 +417,13 @@ def __init__( self.alpha, ) + def _check_window_bounds( + self, start: np.ndarray, end: np.ndarray, num_vals: int + ) -> None: + # emw algorithms are iterative with each point + # ExponentialMovingWindowIndexer "bounds" are the entire window + pass + def _get_window_indexer(self) -> BaseIndexer: """ Return an indexer class that will compute the window start and end bounds diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index ae0b2d5a5034a..73bdc626554a2 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -63,7 +63,6 @@ FixedWindowIndexer, GroupbyIndexer, VariableWindowIndexer, - check_window_bounds, ) from pandas.core.indexes.api import ( DatetimeIndex, @@ -228,6 +227,20 @@ def _validate(self) -> None: if self.method not in ["table", "single"]: raise ValueError("method must be 'table' or 'single") + def _check_window_bounds( + self, start: np.ndarray, end: np.ndarray, num_vals: int + ) -> None: + if len(start) != len(end): + raise ValueError( + f"start ({len(start)}) and end ({len(end)}) bounds must be the " + f"same length" + ) + elif len(start) != num_vals: + raise ValueError( + f"start and end bounds ({len(start)}) must be the same length " + f"as the object ({num_vals})" + ) + def _create_data(self, obj: NDFrameT) -> NDFrameT: """ Split data into blocks & return conformed data. @@ -312,7 +325,7 @@ def __iter__(self): center=self.center, closed=self.closed, ) - check_window_bounds(start, end, len(obj)) + self._check_window_bounds(start, end, len(obj)) for s, e in zip(start, end): result = obj.iloc[slice(s, e)] @@ -563,7 +576,7 @@ def calc(x): center=self.center, closed=self.closed, ) - check_window_bounds(start, end, len(x)) + self._check_window_bounds(start, end, len(x)) return func(x, start, end, min_periods, *numba_args) @@ -604,7 +617,7 @@ def _numba_apply( center=self.center, closed=self.closed, ) - check_window_bounds(start, end, len(values)) + self._check_window_bounds(start, end, len(values)) aggregator = executor.generate_shared_aggregator( func, engine_kwargs, numba_cache_key_str ) @@ -1541,7 +1554,7 @@ def cov_func(x, y): center=self.center, closed=self.closed, ) - check_window_bounds(start, end, len(x_array)) + self._check_window_bounds(start, end, len(x_array)) with np.errstate(all="ignore"): mean_x_y = window_aggregations.roll_mean( @@ -1582,7 +1595,7 @@ def corr_func(x, y): center=self.center, closed=self.closed, ) - check_window_bounds(start, end, len(x_array)) + self._check_window_bounds(start, end, len(x_array)) with np.errstate(all="ignore"): mean_x_y = window_aggregations.roll_mean(