From 6078f5625f3d6ad074a4b6dcfccc5a6e7a478f9c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 16 Jan 2020 18:12:38 -0800 Subject: [PATCH 1/6] support non-monotonic PI --- pandas/core/indexes/period.py | 65 +++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 22 deletions(-) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index a54d09e8bede0..caaf00b771a33 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -512,25 +512,20 @@ def get_value(self, series, key): return series.iat[key] if isinstance(key, str): + try: + loc = self._get_string_slice(key) + return series[loc] + except (TypeError, ValueError): + pass + asdt, reso = parse_time_string(key, self.freq) grp = resolution.Resolution.get_freq_group(reso) freqn = resolution.get_freq_group(self.freq) - vals = self._ndarray_values - - # if our data is higher resolution than requested key, slice - if grp < freqn: - iv = Period(asdt, freq=(grp, 1)) - ord1 = iv.asfreq(self.freq, how="S").ordinal - ord2 = iv.asfreq(self.freq, how="E").ordinal - - if ord2 < vals[0] or ord1 > vals[-1]: - raise KeyError(key) + # _get_string_slice will handle cases where grp < freqn + assert grp >= freqn - pos = np.searchsorted(self._ndarray_values, [ord1, ord2]) - key = slice(pos[0], pos[1] + 1) - return series[key] - elif grp == freqn: + if grp == freqn: key = Period(asdt, freq=self.freq) loc = self.get_loc(key) return series.iloc[loc] @@ -601,6 +596,11 @@ def get_loc(self, key, method=None, tolerance=None): """ if isinstance(key, str): + try: + return self._get_string_slice(key) + except (TypeError, KeyError, ValueError, OverflowError): + pass + try: asdt, reso = parse_time_string(key, self.freq) key = asdt @@ -713,20 +713,41 @@ def _parsed_string_to_bounds(self, reso, parsed): raise KeyError(reso) return (t1.asfreq(self.freq, how="start"), t1.asfreq(self.freq, how="end")) - def _get_string_slice(self, key): - if not self.is_monotonic: - raise 
ValueError("Partial indexing only valid for ordered time series") + def _get_string_slice(self, key, use_lhs: bool = True, use_rhs: bool = True): parsed, reso = parse_time_string(key, self.freq) grp = resolution.Resolution.get_freq_group(reso) freqn = resolution.get_freq_group(self.freq) - if reso in ["day", "hour", "minute", "second"] and not grp < freqn: - raise KeyError(key) + if not grp < freqn: + raise ValueError(key) t1, t2 = self._parsed_string_to_bounds(reso, parsed) - return slice( - self.searchsorted(t1, side="left"), self.searchsorted(t2, side="right") - ) + i8vals = self.asi8 + + if self.is_monotonic: + + # we are out of range + if len(self) and ( + (use_lhs and t1 < self[0] and t2 < self[0]) + or ((use_rhs and t1 > self[-1] and t2 > self[-1])) + ): + raise KeyError(key) + + # TODO: does this depend on being monotonic _increasing_? + # If so, DTI will also be affected. + + # a monotonic (sorted) series can be sliced + # Use asi8.searchsorted to avoid re-validating Periods + left = i8vals.searchsorted(t1.ordinal, side="left") if use_lhs else None + right = i8vals.searchsorted(t2.ordinal, side="right") if use_rhs else None + return slice(left, right) + + else: + lhs_mask = (i8vals >= t1.ordinal) if use_lhs else True + rhs_mask = (i8vals <= t2.ordinal) if use_rhs else True + + # try to find the dates + return (lhs_mask & rhs_mask).nonzero()[0] def _convert_tolerance(self, tolerance, target): tolerance = DatetimeIndexOpsMixin._convert_tolerance(self, tolerance, target) From 484df530f9903661c82be3da8c4d65fad84adfa4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 16 Jan 2020 18:31:34 -0800 Subject: [PATCH 2/6] Test for partial-string indexing on non-monotonic PI --- .../indexes/period/test_partial_slicing.py | 51 +++++++++++++++++-- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py index 9ca2dd169416f..e67355ecdd86e 100644 --- 
a/pandas/tests/indexes/period/test_partial_slicing.py +++ b/pandas/tests/indexes/period/test_partial_slicing.py @@ -7,9 +7,6 @@ class TestPeriodIndex: - def setup_method(self, method): - pass - def test_slice_with_negative_step(self): ts = Series(np.arange(20), period_range("2014-01", periods=20, freq="M")) SLC = pd.IndexSlice @@ -133,3 +130,51 @@ def test_range_slice_outofbounds(self): tm.assert_frame_equal(df["2013/10/15":"2013/10/17"], empty) tm.assert_frame_equal(df["2013-06":"2013-09"], empty) tm.assert_frame_equal(df["2013-11":"2013-12"], empty) + + def test_partial_slice_doesnt_require_monotonicity(self): + # See also: DatetimeIndex test of the same name + dti = pd.date_range("2014-01-01", periods=30, freq="30D") + pi = dti.to_period("D") + + ser_montonic = pd.Series(np.arange(30), index=pi) + + shuffler = list(range(0, 30, 2)) + list(range(1, 31, 2)) + ser = ser_montonic[shuffler] + nidx = ser.index + + # Manually identified locations of year==2014 + indexer_2014 = np.array([0, 1, 2, 3, 4, 5, 6, 15, 16, 17, 18, 19, 20]) + assert (nidx[indexer_2014].year == 2014).all() + assert not (nidx[~indexer_2014].year == 2014).any() + + result = nidx.get_loc("2014") + tm.assert_numpy_array_equal(result, indexer_2014) + + expected = ser[indexer_2014] + + result = nidx.get_value(ser, "2014") + tm.assert_series_equal(result, expected) + + result = ser.loc["2014"] + tm.assert_series_equal(result, expected) + + result = ser["2014"] + tm.assert_series_equal(result, expected) + + # Manually identified locations where ser.index is within Mat 2015 + indexer_may2015 = np.array([23]) + assert nidx[23].year == 2015 and nidx[23].month == 5 + + result = nidx.get_loc("May 2015") + tm.assert_numpy_array_equal(result, indexer_may2015) + + expected = ser[indexer_may2015] + + result = nidx.get_value(ser, "May 2015") + tm.assert_series_equal(result, expected) + + result = ser.loc["May 2015"] + tm.assert_series_equal(result, expected) + + result = ser["May 2015"] + 
tm.assert_series_equal(result, expected) From 7867873cae149a0318f26e8f66fdb9fa2ae93f78 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 16 Jan 2020 19:31:12 -0800 Subject: [PATCH 3/6] 32bit compat --- pandas/tests/indexes/period/test_partial_slicing.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py index e67355ecdd86e..e26027d33dced 100644 --- a/pandas/tests/indexes/period/test_partial_slicing.py +++ b/pandas/tests/indexes/period/test_partial_slicing.py @@ -143,7 +143,9 @@ def test_partial_slice_doesnt_require_monotonicity(self): nidx = ser.index # Manually identified locations of year==2014 - indexer_2014 = np.array([0, 1, 2, 3, 4, 5, 6, 15, 16, 17, 18, 19, 20]) + indexer_2014 = np.array( + [0, 1, 2, 3, 4, 5, 6, 15, 16, 17, 18, 19, 20], dtype=np.int64 + ) assert (nidx[indexer_2014].year == 2014).all() assert not (nidx[~indexer_2014].year == 2014).any() @@ -162,7 +164,7 @@ def test_partial_slice_doesnt_require_monotonicity(self): tm.assert_series_equal(result, expected) # Manually identified locations where ser.index is within Mat 2015 - indexer_may2015 = np.array([23]) + indexer_may2015 = np.array([23], dtype=np.int64) assert nidx[23].year == 2015 and nidx[23].month == 5 result = nidx.get_loc("May 2015") From d8a6e489f596d69ab3cde9c5960508380143399c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 16 Jan 2020 20:12:06 -0800 Subject: [PATCH 4/6] troubleshoot 32bit build --- pandas/tests/indexes/period/test_partial_slicing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py index e26027d33dced..833901ea7ba22 100644 --- a/pandas/tests/indexes/period/test_partial_slicing.py +++ b/pandas/tests/indexes/period/test_partial_slicing.py @@ -144,7 +144,7 @@ def 
test_partial_slice_doesnt_require_monotonicity(self): # Manually identified locations of year==2014 indexer_2014 = np.array( - [0, 1, 2, 3, 4, 5, 6, 15, 16, 17, 18, 19, 20], dtype=np.int64 + [0, 1, 2, 3, 4, 5, 6, 15, 16, 17, 18, 19, 20], dtype=np.intp ) assert (nidx[indexer_2014].year == 2014).all() assert not (nidx[~indexer_2014].year == 2014).any() @@ -164,7 +164,7 @@ def test_partial_slice_doesnt_require_monotonicity(self): tm.assert_series_equal(result, expected) # Manually identified locations where ser.index is within Mat 2015 - indexer_may2015 = np.array([23], dtype=np.int64) + indexer_may2015 = np.array([23], dtype=np.intp) assert nidx[23].year == 2015 and nidx[23].month == 5 result = nidx.get_loc("May 2015") From ab92c5cabb94586eac8929a11272f0ba7a67c20b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 17 Jan 2020 10:28:07 -0800 Subject: [PATCH 5/6] whatsnew --- doc/source/whatsnew/v1.1.0.rst | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b5a7b19f160a4..74c648bd6fa85 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -13,6 +13,24 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +.. _whatsnew_110.period_index_partial_string_slicing: + +Nonmonotonic PeriodIndex Partial String Slicing +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:class:`PeriodIndex` now supports partial string slicing for non-monotonic indexes, mirroring :class:`DatetimeIndex` behavior (:issue:`31096`) + +For example: + +.. ipython:: python + + dti = pd.date_range("2014-01-01", periods=30, freq="30D") + pi = dti.to_period("D") + ser_monotonic = pd.Series(np.arange(30), index=pi) + shuffler = list(range(0, 30, 2)) + list(range(1, 31, 2)) + ser = ser_monotonic[shuffler] + ser["2014"] + ser.loc["May 2015"] + .. 
_whatsnew_110.enhancements.other: Other enhancements From a8cc7319239643b676c1fb441a157f1026a5da29 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 19 Jan 2020 13:27:10 -0800 Subject: [PATCH 6/6] Update docs --- doc/source/user_guide/timeseries.rst | 3 +++ doc/source/whatsnew/v1.1.0.rst | 2 ++ 2 files changed, 5 insertions(+) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 08b2ae0a4a837..ac2d97382c93f 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1981,6 +1981,9 @@ As with ``DatetimeIndex``, the endpoints will be included in the result. The exa dfp['2013-01-01 10H':'2013-01-01 11H'] +As of version 1.1.0, this works with non-monotonic ``PeriodIndex`` indexes. + + Frequency conversion and resampling with PeriodIndex ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The frequency of ``Period`` and ``PeriodIndex`` can be converted via the ``asfreq`` diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index bfe7a7687db06..a8624afd16ee0 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -28,6 +28,8 @@ For example: ser_monotonic = pd.Series(np.arange(30), index=pi) shuffler = list(range(0, 30, 2)) + list(range(1, 31, 2)) ser = ser_monotonic[shuffler] + ser + ser["2014"] ser.loc["May 2015"]