# pandas/core/indexes/base.py (Index)
@property
def _is_strictly_monotonic(self):
    """
    Return True if the index is strictly monotonic in either direction.

    This is the logical OR of ``_is_strictly_monotonic_increasing`` and
    ``_is_strictly_monotonic_decreasing``.
    """
    return (self._is_strictly_monotonic_increasing or
            self._is_strictly_monotonic_decreasing)


# pandas/core/indexes/datetimelike.py (datetime-like index mixin)
def _fast_intersection(self, other):
    """
    Intersect two sorted indexes by slicing instead of hashing.

    Only valid under the conditions checked by ``intersection`` (both
    operands non-empty, strictly monotonic and sharing the same ``freq``).

    Parameters
    ----------
    other : index of the same kind as ``self``

    Returns
    -------
    array-like
        Slice of the underlying ``values`` of one operand covering the
        overlap of the two ranges; empty if the ranges do not overlap.
    """
    # Coerce both operands into the same sort order (that of ``self``).
    ascending = self.is_monotonic_increasing
    if ascending != other.is_monotonic_increasing:
        other = other.sort_values(ascending=ascending)

    # ``left`` is the operand whose first element comes first in the
    # shared sort order.
    if ascending:
        self_starts_first = self[0] <= other[0]
    else:
        self_starts_first = self[0] >= other[0]
    if self_starts_first:
        left, right = self, other
    else:
        left, right = other, self

    # The overlap begins where ``right`` begins and stops at whichever
    # operand runs out first.
    start = right[0]
    if ascending:
        end = min(left[-1], right[-1])
        disjoint = end < start
    else:
        end = max(left[-1], right[-1])
        disjoint = end > start

    if disjoint:
        # Keep the dtype of the values instead of returning a bare list.
        return left.values[:0]
    return left.values[slice(*left.slice_locs(start, end))]


def intersection(self, other):
    """
    Specialized intersection for DateTimeIndexOpsMixin objects.

    May be much faster than Index.intersection.

    The fast (slicing) path is used only if all of the following hold:

    1. Both indexes have a ``freq`` and the two are equal.
    2. That ``freq`` is anchored.
    3. Both indexes are strictly monotonic (increasing or decreasing).

    Otherwise this falls back to ``Index.intersection``.

    Parameters
    ----------
    other : Index or array-like

    Returns
    -------
    Index
        A shallow copied intersection between the two things passed in
    """
    self._assert_can_do_setop(other)

    if self.equals(other):
        return self._get_reconciled_name_object(other)

    if len(self) == 0:
        return self
    if len(other) == 0:
        # Always hand back an index, even when ``other`` is a bare
        # array-like such as an empty list.
        return other if isinstance(other, Index) else self[:0]

    # ``and`` short-circuits, so ``other.freq`` is only touched once
    # index_offsets_equal has established that it exists.
    use_fast_path = (
        index_offsets_equal(self, other) and
        # for period intersections with freq
        other.freq.isAnchored() and
        self._is_strictly_monotonic and
        other._is_strictly_monotonic)

    if not use_fast_path:
        # Conditions aren't right for slicing: regular Index.intersection.
        result = Index.intersection(self, other)
        if result.empty:
            result = result.astype(self.dtype)
        # ``other`` may be a plain Index or array-like without ``freq``.
        freq = self.freq or getattr(other, 'freq', None)
        # NOTE(review): ``freq`` is attached without checking that the
        # intersected values actually conform to it -- confirm intended.
        result = self._shallow_copy(result._values, name=result.name,
                                    freq=freq)
        if result.freq is None:
            result.freq = frequencies.to_offset(result.inferred_freq)
        return result

    intersected_slice = self._fast_intersection(other)
    name = ops.get_op_result_name(self, other)
    return self._shallow_copy(intersected_slice, name=name)
# pandas/core/indexes/timedeltas.py (TimedeltaIndex)
def _wrap_union_result(self, other, result):
    """
    Wrap raw union values in a new index built via ``_simple_new``.

    The result keeps ``self.name`` only when ``other`` has an equal name;
    otherwise it is unnamed. ``freq`` is always passed as None.
    """
    if self.name == other.name:
        shared_name = self.name
    else:
        shared_name = None
    return self._simple_new(result, name=shared_name, freq=None)
# pandas/tests/indexes/timedeltas/test_setops.py (new module-level tests)
def _assert_intersection_both_ways(idx1, idx2, expected):
    """Check that the intersection is ``expected`` in either operand order."""
    tm.assert_index_equal(idx1.intersection(idx2), expected)
    tm.assert_index_equal(idx2.intersection(idx1), expected)


def _assert_intersection_in_both_sort_orders(index1, index2, expected):
    """Run the two-way check on the inputs as given and sorted descending."""
    _assert_intersection_both_ways(index1, index2, expected)
    _assert_intersection_both_ways(
        index1.sort_values(ascending=False),
        index2.sort_values(ascending=False),
        expected.sort_values(ascending=False))


@pytest.mark.parametrize('idx1,idx2,expected', [
    (pd.to_timedelta(range(2, 6), unit='s'),
     pd.to_timedelta(range(3), unit='s'),
     TimedeltaIndex(['00:00:02'])),
    (pd.to_timedelta(range(3), unit='s'),
     pd.to_timedelta(range(2, 6), unit='s'),
     TimedeltaIndex(['00:00:02'])),
])
def test_intersection_intersects_ascending(idx1, idx2, expected):
    result = idx1.intersection(idx2)
    assert result.equals(expected)


@pytest.mark.parametrize('idx1,idx2,expected', [
    (pd.to_timedelta(range(6, 3, -1), unit='s'),
     pd.to_timedelta(range(5, 1, -1), unit='s'),
     TimedeltaIndex(['00:00:05', '00:00:04'])),
    (pd.to_timedelta(range(5, 1, -1), unit='s'),
     pd.to_timedelta(range(6, 3, -1), unit='s'),
     TimedeltaIndex(['00:00:05', '00:00:04'])),
])
def test_intersection_intersects_descending(idx1, idx2, expected):
    # GH 17391
    result = idx1.intersection(idx2)
    assert result.equals(expected)


def test_intersection_intersects_descending_no_intersect():
    # GH 17391: two decreasing ranges that do not overlap
    idx1 = pd.to_timedelta(range(6, 4, -1), unit='s')
    idx2 = pd.to_timedelta(range(4, 1, -1), unit='s')
    result = idx1.intersection(idx2)
    assert len(result) == 0


def test_intersection_intersects_len_1():
    idx1 = pd.to_timedelta(range(1, 2), unit='s')
    idx2 = pd.to_timedelta(range(1, 0, -1), unit='s')
    intersection = idx1.intersection(idx2)
    expected = TimedeltaIndex(['00:00:01'],
                              dtype='timedelta64[ns]')
    tm.assert_index_equal(intersection, expected)


def test_intersection_can_intersect_self():
    idx = pd.to_timedelta(range(1, 2), unit='s')
    result = idx.intersection(idx)
    tm.assert_index_equal(idx, result)


def test_intersection_not_sorted():
    idx1 = pd.to_timedelta((1, 3, 2, 5, 4), unit='s')
    idx2 = pd.to_timedelta((1, 2, 3, 5, 4), unit='s')
    result = idx1.intersection(idx2)
    expected = idx1
    tm.assert_index_equal(result, expected)


def test_intersection_not_unique():
    idx1 = pd.to_timedelta((1, 2, 2, 3, 3, 5), unit='s')
    idx2 = pd.to_timedelta((1, 2, 3, 4), unit='s')
    expected = pd.to_timedelta((1, 2, 2, 3, 3), unit='s')

    result = idx1.intersection(idx2)
    tm.assert_index_equal(result, expected)

    result = idx2.intersection(idx1)
    tm.assert_index_equal(result, expected)


@pytest.mark.parametrize("index1, index2, expected", [
    (pd.to_timedelta((1, 2, 3, 4, 5, 6, 7, 8), unit='s'),
     pd.to_timedelta((2, 3, 4, 8), unit='s'),
     pd.to_timedelta((2, 3, 4, 8), unit='s')),
    (pd.to_timedelta((1, 2, 3, 4, 5), unit='s'),
     pd.to_timedelta((2, 3, 4), unit='s'),
     pd.to_timedelta((2, 3, 4), unit='s')),
    (pd.to_timedelta((2, 4, 5, 6), unit='s'),
     pd.to_timedelta((2, 3, 4), unit='s'),
     pd.to_timedelta((2, 4), unit='s')),
])
def test_intersection_different_lengths(index1, index2, expected):
    _assert_intersection_in_both_sort_orders(index1, index2, expected)


@pytest.mark.parametrize("index1, index2, expected", [
    (pd.to_timedelta((2, 4, 5, 6), unit='s'),
     pd.to_timedelta((2, 3, 4, 6), unit='s'),
     pd.to_timedelta((2, 4, 6), unit='s')),
    (pd.to_timedelta((2, 4, 5), unit='s'),
     pd.to_timedelta((3, 4, 5, 6), unit='s'),
     pd.to_timedelta((4, 5), unit='s')),
])
def test_intersection_not_a_subset(index1, index2, expected):
    _assert_intersection_in_both_sort_orders(index1, index2, expected)
# pandas/tseries/offsets.py
def index_offsets_equal(first, second):
    """
    Check whether two indexes both have a ``freq`` and the two are equal.

    Parameters
    ----------
    first : Index
    second : Index

    Returns
    -------
    bool
        False if either argument has no ``freq`` attribute or a ``freq``
        of None; otherwise whether the two ``freq`` values compare equal.
    """
    # getattr with a default lets plain indexes and array-likes through.
    first_freq = getattr(first, 'freq', None)
    second_freq = getattr(second, 'freq', None)
    if first_freq is None or second_freq is None:
        return False
    return first_freq == second_freq


# ---------------------------------------------------------------------
# DateOffset