Skip to content
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -727,6 +727,7 @@ Deprecations
- Support for multi-dimensional indexing (e.g. ``index[:, None]``) on a :class:`Index` is deprecated and will be removed in a future version, convert to a numpy array before indexing instead (:issue:`30588`)
- The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`)
- The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30610`)
- :class:`~DataFrame.diff` will raise a ``TypeError`` rather than implicitly losing the dtype of extension types in the future. Convert to the correct dtype before calling ``diff`` instead (:issue:`31025`)

**Selecting Columns from a Grouped DataFrame**

Expand Down Expand Up @@ -1018,6 +1019,8 @@ Numeric
- Bug in :meth:`DataFrame.round` where a :class:`DataFrame` with a :class:`CategoricalIndex` of :class:`IntervalIndex` columns would incorrectly raise a ``TypeError`` (:issue:`30063`)
- Bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` when there are duplicated indices (:issue:`30463`)
- Bug in :class:`DataFrame` cumulative operations (e.g. cumsum, cummax) incorrect casting to object-dtype (:issue:`19296`)
- Bug in :class:`~DataFrame.diff` losing the dtype for extension types (:issue:`30889`)
- Bug in :class:`DataFrame.diff` raising an ``IndexError`` when one of the columns was a nullable integer dtype (:issue:`30967`)

Conversion
^^^^^^^^^^
Expand Down Expand Up @@ -1158,7 +1161,7 @@ Sparse
^^^^^^
- Bug in :class:`SparseDataFrame` arithmetic operations incorrectly casting inputs to float (:issue:`28107`)
- Bug in ``DataFrame.sparse`` returning a ``Series`` when there was a column named ``sparse`` rather than the accessor (:issue:`30758`)
-
- Fixed :meth:`operator.xor` with a boolean-dtype ``SparseArray``. Now returns a sparse result, rather than object dtype (:issue:`31025`)

ExtensionArray
^^^^^^^^^^^^^^
Expand Down
9 changes: 5 additions & 4 deletions pandas/_libs/sparse_op_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ def get_op(tup):
'ge': '{0} >= {1}',

'and': '{0} & {1}', # logical op
'or': '{0} | {1}'}
'or': '{0} | {1}',
'xor': '{0} ^ {1}'}

return ops_dict[opname].format(lval, rval)

Expand All @@ -94,7 +95,7 @@ def get_dispatch(dtypes):
ops_list = ['add', 'sub', 'mul', 'div', 'mod', 'truediv',
'floordiv', 'pow',
'eq', 'ne', 'lt', 'gt', 'le', 'ge',
'and', 'or']
'and', 'or', 'xor']

for opname in ops_list:
for dtype, arith_comp_group, logical_group in dtypes:
Expand All @@ -104,13 +105,13 @@ def get_dispatch(dtypes):
elif opname in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):
# comparison op
rdtype = 'uint8'
elif opname in ('and', 'or'):
elif opname in ('and', 'or', 'xor'):
# logical op
rdtype = 'uint8'
else:
rdtype = dtype

if opname in ('and', 'or'):
if opname in ('and', 'or', 'xor'):
if logical_group:
yield opname, dtype, rdtype
else:
Expand Down
29 changes: 28 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Generic data algorithms. This module is experimental at the moment and not
intended for public consumption
"""
import operator
from textwrap import dedent
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
from warnings import catch_warnings, simplefilter, warn
Expand Down Expand Up @@ -1812,7 +1813,7 @@ def searchsorted(arr, value, side="left", sorter=None):
_diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"}


def diff(arr, n: int, axis: int = 0):
def diff(arr, n: int, axis: int = 0, stacklevel=3):
"""
difference of n between self,
analogous to s-s.shift(n)
Expand All @@ -1824,16 +1825,42 @@ def diff(arr, n: int, axis: int = 0):
number of periods
axis : int
axis to shift on
stacklevel : int
The stacklevel for the lost dtype warning.

Returns
-------
shifted
"""
from pandas.core.arrays import PandasDtype

n = int(n)
na = np.nan
dtype = arr.dtype

if dtype.kind == "b":
op = operator.xor
else:
op = operator.sub

if isinstance(dtype, PandasDtype):
# PandasArray cannot necessarily hold shifted versions of itself.
arr = np.asarray(arr)
dtype = arr.dtype
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would it be more idiomatic to do extract_array up front?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would say not because we don't need to "extract" the array, since arrays are already passed (this doesn't get passed Series or Index objects). You can of course use extract_array to get rid of PandasArrays, but I think the above is more explicit.


if is_extension_array_dtype(dtype):
if hasattr(arr, f"__{op.__name__}__"):
return op(arr, arr.shift(n))
else:
warn(
"dtype lost in 'diff()'. In the future this will raise a "
"TypeError. Convert to a suitable dtype prior to calling 'diff'.",
FutureWarning,
stacklevel=stacklevel,
)
arr = com.values_from_object(arr)
dtype = arr.dtype

is_timedelta = False
is_bool = False
if needs_i8_conversion(arr):
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def _sparse_array_op(
left, right = right, left
name = name[1:]

if name in ("and", "or") and dtype == "bool":
if name in ("and", "or", "xor") and dtype == "bool":
opname = f"sparse_{name}_uint8"
# to make template simple, cast here
left_sp_values = left.sp_values.view(np.uint8)
Expand Down Expand Up @@ -1459,6 +1459,7 @@ def _add_unary_ops(cls):
def _add_comparison_ops(cls):
cls.__and__ = cls._create_comparison_method(operator.and_)
cls.__or__ = cls._create_comparison_method(operator.or_)
cls.__xor__ = cls._create_arithmetic_method(operator.xor)
super()._add_comparison_ops()

# ----------
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6584,6 +6584,11 @@ def diff(self, periods=1, axis=0) -> "DataFrame":
DataFrame.shift: Shift index by desired number of periods with an
optional time freq.

Notes
-----
For boolean dtypes, this uses :meth:`operator.xor` rather than
:meth:`operator.sub`.

Examples
--------
Difference with previous row
Expand Down
11 changes: 10 additions & 1 deletion pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1280,7 +1280,10 @@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None):

def diff(self, n: int, axis: int = 1) -> List["Block"]:
""" return block for the diff of the values """
new_values = algos.diff(self.values, n, axis=axis)
new_values = algos.diff(self.values, n, axis=axis, stacklevel=7)
# We use block_shape for ExtensionBlock subclasses, which may call here
# via a super.
new_values = _block_shape(new_values, ndim=self.ndim)
return [self.make_block(values=new_values)]

def shift(self, periods, axis=0, fill_value=None):
Expand Down Expand Up @@ -1860,6 +1863,12 @@ def interpolate(
placement=self.mgr_locs,
)

def diff(self, n: int, axis: int = 1) -> List["Block"]:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note this is on ExtensionBlock now. The deprecation for EAs not implementing __sub__ required a similar block / axis handling, so I've moved it up here. The _block_shape is all the way up in Block.diff.

if axis == 1:
# we are by definition 1D.
axis = 0
return super().diff(n, axis)

def shift(
self,
periods: int,
Expand Down
7 changes: 6 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2241,6 +2241,11 @@ def diff(self, periods=1) -> "Series":
optional time freq.
DataFrame.diff: First discrete difference of object.

Notes
-----
For boolean dtypes, this uses :meth:`operator.xor` rather than
:meth:`operator.sub`.

Examples
--------
Difference with previous row
Expand Down Expand Up @@ -2277,7 +2282,7 @@ def diff(self, periods=1) -> "Series":
5 NaN
dtype: float64
"""
result = algorithms.diff(com.values_from_object(self), periods)
result = algorithms.diff(self.array, periods)
return self._constructor(result, index=self.index).__finalize__(self)

def autocorr(self, lag=1) -> float:
Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/arrays/categorical/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,21 @@ def test_isin_empty(empty):
tm.assert_numpy_array_equal(expected, result)


def test_diff():
s = pd.Series([1, 2, 3], dtype="category")
with tm.assert_produces_warning(FutureWarning):
result = s.diff()
expected = pd.Series([np.nan, 1, 1])
tm.assert_series_equal(result, expected)

expected = expected.to_frame(name="A")
df = s.to_frame(name="A")
with tm.assert_produces_warning(FutureWarning):
result = df.diff()

tm.assert_frame_equal(result, expected)


class TestTake:
# https://github.com/pandas-dev/pandas/issues/20664

Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/arrays/sparse/test_arithmetics.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,14 @@ def test_mixed_array_comparison(self, kind):
assert b.dtype == SparseDtype(rdtype, fill_value=2)
self._check_comparison_ops(a, b, values, rvalues)

def test_xor(self):
s = SparseArray([True, True, False, False])
t = SparseArray([True, False, True, False])
result = s ^ t
sp_index = pd.core.arrays.sparse.IntIndex(4, np.array([0, 1, 2], dtype="int32"))
expected = SparseArray([False, True, True], sparse_index=sp_index)
tm.assert_sp_array_equal(result, expected)


@pytest.mark.parametrize("op", [operator.eq, operator.add])
def test_with_list(op):
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/arrays/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -879,3 +879,19 @@ def test_value_counts_na():
result = arr.value_counts(dropna=True)
expected = pd.Series([1, 1], index=[True, False], dtype="Int64")
tm.assert_series_equal(result, expected)


def test_diff():
a = pd.array(
[True, True, False, False, True, None, True, None, False], dtype="boolean"
)
result = pd.core.algorithms.diff(a, 1)
expected = pd.array(
[None, False, True, False, True, None, None, None, None], dtype="boolean"
)
tm.assert_extension_array_equal(result, expected)

s = pd.Series(a)
result = s.diff()
expected = pd.Series(expected)
tm.assert_series_equal(result, expected)
22 changes: 22 additions & 0 deletions pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,28 @@ def test_container_shift(self, data, frame, periods, indices):

compare(result, expected)

@pytest.mark.parametrize("periods", [1, -2])
def test_diff(self, data, periods):
data = data[:5]
try:
# does this array implement ops?
data - data
except Exception:
pytest.skip(f"{type(data)} does not support diff")
s = pd.Series(data)
result = s.diff(periods)
expected = pd.Series(data - data.shift(periods))
self.assert_series_equal(result, expected)

df = pd.DataFrame({"A": data, "B": [1.0] * 5})
result = df.diff(periods)
if periods == 1:
b = [np.nan, 0, 0, 0, 0]
else:
b = [0, 0, 0, np.nan, np.nan]
expected = pd.DataFrame({"A": expected, "B": b})
self.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
"periods, indices",
[[-4, [-1, -1]], [-1, [1, -1]], [0, [0, 1]], [1, [-1, 0]], [4, [-1, -1]]],
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/extension/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,10 @@ def test_repeat(self, data, repeats, as_series, use_numpy):
# Fails creating expected
super().test_repeat(data, repeats, as_series, use_numpy)

@pytest.mark.skip(reason="algorithms.diff skips PandasArray")
def test_diff(self, data, periods):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this skipped?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Either a bug or not implemented behavior in PandasArray. PandasArray.shift() doesn't allow changing the dtype from int to float, so diff (which uses shift) would fail.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe xfail it then?

return super().test_diff(data, periods)


@skip_nested
class TestArithmetics(BaseNumPyTests, base.BaseArithmeticOpsTests):
Expand Down