pandas-dev · jorisvandenbossche · Jan 23, 2020 · Jan 15, 2020 · Jan 15, 2020 · Jan 16, 2020
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -727,6 +727,7 @@ Deprecations
 - Support for multi-dimensional indexing (e.g. ``index[:, None]``) on a :class:`Index` is deprecated and will be removed in a future version, convert to a numpy array before indexing instead (:issue:`30588`)
 - The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`)
 - The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30610`)
+- :meth:`DataFrame.diff` will raise a ``TypeError`` rather than implicitly losing the dtype of extension types in the future. Convert to the correct dtype before calling ``diff`` instead (:issue:`31025`)
 
 **Selecting Columns from a Grouped DataFrame**
 
@@ -1018,6 +1019,8 @@ Numeric
 - Bug in :meth:`DataFrame.round` where a :class:`DataFrame` with a :class:`CategoricalIndex` of :class:`IntervalIndex` columns would incorrectly raise a ``TypeError`` (:issue:`30063`)
 - Bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` when there are duplicated indices (:issue:`30463`)
 - Bug in :class:`DataFrame` cumulative operations (e.g. cumsum, cummax) incorrect casting to object-dtype (:issue:`19296`)
+- Bug in :meth:`DataFrame.diff` losing the dtype for extension types (:issue:`30889`)
+- Bug in :meth:`DataFrame.diff` raising an ``IndexError`` when one of the columns was a nullable integer dtype (:issue:`30967`)
 
 Conversion
 ^^^^^^^^^^
@@ -1158,7 +1161,7 @@ Sparse
 ^^^^^^
 - Bug in :class:`SparseDataFrame` arithmetic operations incorrectly casting inputs to float (:issue:`28107`)
 - Bug in ``DataFrame.sparse`` returning a ``Series`` when there was a column named ``sparse`` rather than the accessor (:issue:`30758`)
--
+- Fixed :func:`operator.xor` with a boolean-dtype ``SparseArray``. Now returns a sparse result, rather than object dtype (:issue:`31025`)
 
 ExtensionArray
 ^^^^^^^^^^^^^^

diff --git a/pandas/_libs/sparse_op_helper.pxi.in b/pandas/_libs/sparse_op_helper.pxi.in
@@ -84,7 +84,8 @@ def get_op(tup):
                 'ge': '{0} >= {1}',
 
                 'and': '{0} & {1}',     # logical op
-                'or': '{0} | {1}'}
+                'or': '{0} | {1}',
+                'xor': '{0} ^ {1}'}
 
     return ops_dict[opname].format(lval, rval)
 
@@ -94,7 +95,7 @@ def get_dispatch(dtypes):
     ops_list = ['add', 'sub', 'mul', 'div', 'mod', 'truediv',
                 'floordiv', 'pow',
                 'eq', 'ne', 'lt', 'gt', 'le', 'ge',
-                'and', 'or']
+                'and', 'or', 'xor']
 
     for opname in ops_list:
         for dtype, arith_comp_group, logical_group in dtypes:
@@ -104,13 +105,13 @@ def get_dispatch(dtypes):
             elif opname in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):
                 # comparison op
                 rdtype = 'uint8'
-            elif opname in ('and', 'or'):
+            elif opname in ('and', 'or', 'xor'):
                 # logical op
                 rdtype = 'uint8'
             else:
                 rdtype = dtype
 
-            if opname in ('and', 'or'):
+            if opname in ('and', 'or', 'xor'):
                 if logical_group:
                     yield opname, dtype, rdtype
             else:

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -2,6 +2,7 @@
 Generic data algorithms. This module is experimental at the moment and not
 intended for public consumption
 """
+import operator
 from textwrap import dedent
 from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
 from warnings import catch_warnings, simplefilter, warn
@@ -1812,7 +1813,7 @@ def searchsorted(arr, value, side="left", sorter=None):
 _diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"}
 
 
-def diff(arr, n: int, axis: int = 0):
+def diff(arr, n: int, axis: int = 0, stacklevel=3):
     """
     difference of n between self,
     analogous to s-s.shift(n)
@@ -1824,16 +1825,42 @@ def diff(arr, n: int, axis: int = 0):
         number of periods
     axis : int
         axis to shift on
+    stacklevel : int, default 3
+        The stacklevel passed to the FutureWarning emitted when the dtype is lost.
 
     Returns
     -------
     shifted
     """
+    from pandas.core.arrays import PandasDtype
 
     n = int(n)
     na = np.nan
     dtype = arr.dtype
 
+    if dtype.kind == "b":
+        op = operator.xor
+    else:
+        op = operator.sub
+
+    if isinstance(dtype, PandasDtype):
+        # PandasArray cannot necessarily hold shifted versions of itself.
+        arr = np.asarray(arr)
+        dtype = arr.dtype
+
+    if is_extension_array_dtype(dtype):
+        if hasattr(arr, f"__{op.__name__}__"):
+            return op(arr, arr.shift(n))
+        else:
+            warn(
+                "dtype lost in 'diff()'. In the future this will raise a "
+                "TypeError. Convert to a suitable dtype prior to calling 'diff'.",
+                FutureWarning,
+                stacklevel=stacklevel,
+            )
+            arr = com.values_from_object(arr)
+            dtype = arr.dtype
+
     is_timedelta = False
     is_bool = False
     if needs_i8_conversion(arr):

diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -141,7 +141,7 @@ def _sparse_array_op(
             left, right = right, left
             name = name[1:]
 
-        if name in ("and", "or") and dtype == "bool":
+        if name in ("and", "or", "xor") and dtype == "bool":
             opname = f"sparse_{name}_uint8"
             # to make template simple, cast here
             left_sp_values = left.sp_values.view(np.uint8)
@@ -1459,6 +1459,7 @@ def _add_unary_ops(cls):
     def _add_comparison_ops(cls):
         cls.__and__ = cls._create_comparison_method(operator.and_)
         cls.__or__ = cls._create_comparison_method(operator.or_)
+        cls.__xor__ = cls._create_arithmetic_method(operator.xor)
         super()._add_comparison_ops()
 
     # ----------

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -6584,6 +6584,11 @@ def diff(self, periods=1, axis=0) -> "DataFrame":
         DataFrame.shift: Shift index by desired number of periods with an
             optional time freq.
 
+        Notes
+        -----
+        For boolean dtypes, this uses :func:`operator.xor` rather than
+        :func:`operator.sub`.
+
         Examples
         --------
         Difference with previous row

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -1280,7 +1280,10 @@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None):
 
     def diff(self, n: int, axis: int = 1) -> List["Block"]:
         """ return block for the diff of the values """
-        new_values = algos.diff(self.values, n, axis=axis)
+        new_values = algos.diff(self.values, n, axis=axis, stacklevel=7)
+        # We use _block_shape for ExtensionBlock subclasses, which may call
+        # here via super().
+        new_values = _block_shape(new_values, ndim=self.ndim)
         return [self.make_block(values=new_values)]
 
     def shift(self, periods, axis=0, fill_value=None):
@@ -1860,6 +1863,12 @@ def interpolate(
             placement=self.mgr_locs,
         )
 
+    def diff(self, n: int, axis: int = 1) -> List["Block"]:
+        if axis == 1:
+            # we are by definition 1D.
+            axis = 0
+        return super().diff(n, axis)
+
     def shift(
         self,
         periods: int,

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -2241,6 +2241,11 @@ def diff(self, periods=1) -> "Series":
             optional time freq.
         DataFrame.diff: First discrete difference of object.
 
+        Notes
+        -----
+        For boolean dtypes, this uses :func:`operator.xor` rather than
+        :func:`operator.sub`.
+
         Examples
         --------
         Difference with previous row
@@ -2277,7 +2282,7 @@ def diff(self, periods=1) -> "Series":
         5    NaN
         dtype: float64
         """
-        result = algorithms.diff(com.values_from_object(self), periods)
+        result = algorithms.diff(self.array, periods)
         return self._constructor(result, index=self.index).__finalize__(self)
 
     def autocorr(self, lag=1) -> float:

diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py
@@ -90,6 +90,21 @@ def test_isin_empty(empty):
     tm.assert_numpy_array_equal(expected, result)
 
 
+def test_diff():
+    s = pd.Series([1, 2, 3], dtype="category")
+    with tm.assert_produces_warning(FutureWarning):
+        result = s.diff()
+    expected = pd.Series([np.nan, 1, 1])
+    tm.assert_series_equal(result, expected)
+
+    expected = expected.to_frame(name="A")
+    df = s.to_frame(name="A")
+    with tm.assert_produces_warning(FutureWarning):
+        result = df.diff()
+
+    tm.assert_frame_equal(result, expected)
+
+
 class TestTake:
     # https://github.com/pandas-dev/pandas/issues/20664
 

diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py
@@ -388,6 +388,14 @@ def test_mixed_array_comparison(self, kind):
         assert b.dtype == SparseDtype(rdtype, fill_value=2)
         self._check_comparison_ops(a, b, values, rvalues)
 
+    def test_xor(self):
+        s = SparseArray([True, True, False, False])
+        t = SparseArray([True, False, True, False])
+        result = s ^ t
+        sp_index = pd.core.arrays.sparse.IntIndex(4, np.array([0, 1, 2], dtype="int32"))
+        expected = SparseArray([False, True, True], sparse_index=sp_index)
+        tm.assert_sp_array_equal(result, expected)
+
 
 @pytest.mark.parametrize("op", [operator.eq, operator.add])
 def test_with_list(op):

diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py
@@ -879,3 +879,19 @@ def test_value_counts_na():
     result = arr.value_counts(dropna=True)
     expected = pd.Series([1, 1], index=[True, False], dtype="Int64")
     tm.assert_series_equal(result, expected)
+
+
+def test_diff():
+    a = pd.array(
+        [True, True, False, False, True, None, True, None, False], dtype="boolean"
+    )
+    result = pd.core.algorithms.diff(a, 1)
+    expected = pd.array(
+        [None, False, True, False, True, None, None, None, None], dtype="boolean"
+    )
+    tm.assert_extension_array_equal(result, expected)
+
+    s = pd.Series(a)
+    result = s.diff()
+    expected = pd.Series(expected)
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
@@ -231,6 +231,28 @@ def test_container_shift(self, data, frame, periods, indices):
 
         compare(result, expected)
 
+    @pytest.mark.parametrize("periods", [1, -2])
+    def test_diff(self, data, periods):
+        data = data[:5]
+        try:
+            # does this array implement ops?
+            data - data
+        except Exception:
+            pytest.skip(f"{type(data)} does not support diff")
+        s = pd.Series(data)
+        result = s.diff(periods)
+        expected = pd.Series(data - data.shift(periods))
+        self.assert_series_equal(result, expected)
+
+        df = pd.DataFrame({"A": data, "B": [1.0] * 5})
+        result = df.diff(periods)
+        if periods == 1:
+            b = [np.nan, 0, 0, 0, 0]
+        else:
+            b = [0, 0, 0, np.nan, np.nan]
+        expected = pd.DataFrame({"A": expected, "B": b})
+        self.assert_frame_equal(result, expected)
+
     @pytest.mark.parametrize(
         "periods, indices",
         [[-4, [-1, -1]], [-1, [1, -1]], [0, [0, 1]], [1, [-1, 0]], [4, [-1, -1]]],

diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py
@@ -248,6 +248,10 @@ def test_repeat(self, data, repeats, as_series, use_numpy):
         # Fails creating expected
         super().test_repeat(data, repeats, as_series, use_numpy)
 
+    @pytest.mark.skip(reason="algorithms.diff skips PandasArray")
+    def test_diff(self, data, periods):
+        return super().test_diff(data, periods)
+
 
 @skip_nested
 class TestArithmetics(BaseNumPyTests, base.BaseArithmeticOpsTests):