diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a6b74865f6619..2e4959b44aeeb 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -316,6 +316,66 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t s s.str.startswith(b'a') +.. _whatsnew_0250.api_breaking.ufuncs: + +ufuncs on Extension Dtype +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Operations with ``numpy`` ufuncs on DataFrames with Extension Arrays, including Sparse Dtypes, will now preserve the +extension dtype of the input in the result; previously the result was coerced to a dense dtype. (:issue:`23743`) + +.. ipython:: python + + df = pd.DataFrame( + {'A': pd.Series([1, np.nan, 3], + dtype=pd.SparseDtype('float64', np.nan))}) + df + df.dtypes + +*Previous Behavior*: + +.. code-block:: python + + In [3]: np.sqrt(df).dtypes + Out[3]: + A float64 + dtype: object + +*New Behavior*: + +.. ipython:: python + + np.sqrt(df).dtypes + +.. _whatsnew_0250.api_breaking.groupby_categorical: + +Categorical dtypes are preserved during groupby +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, columns that were categorical, but not the groupby key(s), would be converted to ``object`` dtype during groupby operations. Pandas will now preserve these dtypes. (:issue:`18502`) + +.. ipython:: python + + df = pd.DataFrame( + {'payload': [-1, -2, -1, -2], + 'col': pd.Categorical(["foo", "bar", "bar", "qux"], ordered=True)}) + df + df.dtypes + +*Previous Behavior*: + +.. code-block:: python + + In [5]: df.groupby('payload').first().col.dtype + Out[5]: dtype('O') + +*New Behavior*: + +.. ipython:: python + + df.groupby('payload').first().col.dtype + + .. 
_whatsnew_0250.api_breaking.incompatible_index_unions: Incompatible Index Type Unions diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 3dda6868a80da..490df5b250f74 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -25,7 +25,8 @@ infer_dtype_from_scalar) from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, is_datetime64_any_dtype, is_dtype_equal, - is_integer, is_object_dtype, is_scalar, is_string_dtype, pandas_dtype) + is_float_dtype, is_integer, is_integer_dtype, is_object_dtype, is_scalar, + is_string_dtype, pandas_dtype) from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.generic import ( ABCIndexClass, ABCSeries, ABCSparseArray, ABCSparseSeries) @@ -1926,8 +1927,28 @@ def make_sparse(arr, kind='block', fill_value=None, dtype=None, copy=False): index = _make_index(length, indices, kind) sparsified_values = arr[mask] + if dtype is not None: - sparsified_values = astype_nansafe(sparsified_values, dtype=dtype) + + # careful about casting here as we could easily specify a type that + # cannot hold the resulting values, e.g. 
integer when we have floats + # if this is not safe then convert the dtype; note that if there are + # nan's in the source array this will raise + + # TODO: ideally this would be done by 'safe' casting in astype_nansafe + # but alas too many cases rely upon this working in the current way + # and casting='safe' doesn't really work in numpy properly + if is_integer_dtype(dtype) and is_float_dtype(sparsified_values.dtype): + result = astype_nansafe( + sparsified_values, dtype=dtype) + if np.allclose(result, sparsified_values, rtol=0): + return result, index, fill_value + + dtype = find_common_type([dtype, sparsified_values.dtype]) + + sparsified_values = astype_nansafe( + sparsified_values, dtype=dtype) + # TODO: copy return sparsified_values, index, fill_value diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 2f66e9ed46aa0..0268f8fbdf467 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -605,7 +605,7 @@ def conv(r, dtype): return [conv(r, dtype) for r, dtype in zip(result, dtypes)] -def astype_nansafe(arr, dtype, copy=True, skipna=False): +def astype_nansafe(arr, dtype, copy=True, skipna=False, casting='unsafe'): """ Cast the elements of an array to a given dtype a nan-safe manner. @@ -616,8 +616,10 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False): copy : bool, default True If False, a view will be attempted but may fail, if e.g. the item sizes don't align. - skipna: bool, default False + skipna : bool, default False Whether or not we should skip NaN when casting as a string-type. + casting : {‘no’, ‘equiv’, ‘safe’, ‘same_kind’, ‘unsafe’} + optional, default 'unsafe' Raises ------ @@ -703,7 +705,7 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False): if copy or is_object_dtype(arr) or is_object_dtype(dtype): # Explicit copy, or required since NumPy can't view from / to object. 
- return arr.astype(dtype, copy=True) + return arr.astype(dtype, copy=True, casting=casting) return arr.view(dtype) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6746844f4b1fa..530590ea5dc45 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -16,7 +16,7 @@ import sys import warnings from textwrap import dedent -from typing import FrozenSet, List, Optional, Set, Type, Union +from typing import FrozenSet, List, Optional, Tuple, Set, Type, Union import numpy as np import numpy.ma as ma @@ -2641,6 +2641,52 @@ def transpose(self, *args, **kwargs): T = property(transpose) + # ---------------------------------------------------------------------- + # Array Interface + + # This is also set in IndexOpsMixin + # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented + __array_priority__ = 1000 + + def __array__(self, dtype=None): + return com.values_from_object(self) + + def __array_wrap__(self, result: np.ndarray, + context: Optional[Tuple] = None) -> 'DataFrame': + """ + We are called post ufunc; reconstruct the original object and dtypes. 
+ + Parameters + ---------- + result : np.ndarray + context : tuple, optional + + Returns + ------- + DataFrame + """ + + d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False) + result = self._constructor(result, **d) + + # we try to cast extension array types back to the original + # TODO: this fails with duplicates, ugh + if self._data.any_extension_types: + result = result.astype(self.dtypes, + copy=False, + errors='ignore', + casting='same_kind') + + return result.__finalize__(self) + + # ideally we would define this to avoid the getattr checks, but + # is slower + # @property + # def __array_interface__(self): + # """ provide numpy array interface method """ + # values = self.values + # return dict(typestr=values.dtype.str,shape=values.shape,data=values) + # ---------------------------------------------------------------------- # Picklability diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 360576ffdb00a..b614298bb912c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1919,25 +1919,6 @@ def empty(self): # ---------------------------------------------------------------------- # Array Interface - # This is also set in IndexOpsMixin - # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented - __array_priority__ = 1000 - - def __array__(self, dtype=None): - return com.values_from_object(self) - - def __array_wrap__(self, result, context=None): - d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False) - return self._constructor(result, **d).__finalize__(self) - - # ideally we would define this to avoid the getattr checks, but - # is slower - # @property - # def __array_interface__(self): - # """ provide numpy array interface method """ - # values = self.values - # return dict(typestr=values.dtype.str,shape=values.shape,data=values) - def to_dense(self): """ Return dense representation of NDFrame (as opposed to sparse). 
@@ -5693,6 +5674,11 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs): **kwargs) return self._constructor(new_data).__finalize__(self) + if not results: + if copy: + self = self.copy() + return self + # GH 19920: retain column metadata after concat result = pd.concat(results, axis=1, copy=False) result.columns = self.columns diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 35ffa552913ae..20b7a595f49e9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -156,12 +156,19 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, obj = self.obj[data.items[locs]] s = groupby(obj, self.grouper) - result = s.aggregate(lambda x: alt(x, axis=self.axis)) + try: + result = s.aggregate(lambda x: alt(x, axis=self.axis)) + except Exception: + # we may have an exception in trying to aggregate + # continue and exclude the block + pass finally: + dtype = block.values.dtype + # see if we can cast the block back to the original dtype - result = block._try_coerce_and_cast_result(result) + result = block._try_coerce_and_cast_result(result, dtype=dtype) newb = block.make_block(result) new_items.append(locs) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 43950f2f503c8..e067185e7ce94 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -786,6 +786,8 @@ def _try_cast(self, result, obj, numeric_only=False): elif is_extension_array_dtype(dtype): # The function can return something of any type, so check # if the type is compatible with the calling EA. 
+ + # return the same type (Series) as our caller try: result = obj._values._from_sequence(result, dtype=dtype) except Exception: @@ -1157,7 +1159,8 @@ def mean(self, *args, **kwargs): """ nv.validate_groupby_func('mean', args, kwargs, ['numeric_only']) try: - return self._cython_agg_general('mean', **kwargs) + return self._cython_agg_general( + 'mean', alt=lambda x, axis: Series(x).mean(**kwargs), **kwargs) except GroupByError: raise except Exception: # pragma: no cover @@ -1179,7 +1182,11 @@ def median(self, **kwargs): Median of values within each group. """ try: - return self._cython_agg_general('median', **kwargs) + return self._cython_agg_general( + 'median', + alt=lambda x, + axis: Series(x).median(axis=axis, **kwargs), + **kwargs) except GroupByError: raise except Exception: # pragma: no cover @@ -1235,7 +1242,10 @@ def var(self, ddof=1, *args, **kwargs): nv.validate_groupby_func('var', args, kwargs) if ddof == 1: try: - return self._cython_agg_general('var', **kwargs) + return self._cython_agg_general( + 'var', + alt=lambda x, axis: Series(x).var(ddof=ddof, **kwargs), + **kwargs) except Exception: f = lambda x: x.var(ddof=ddof, **kwargs) with _group_selection_context(self): @@ -1263,7 +1273,6 @@ def sem(self, ddof=1): Series or DataFrame Standard error of the mean of values within each group. 
""" - return self.std(ddof=ddof) / np.sqrt(self.count()) @Substitution(name='groupby') @@ -1290,7 +1299,7 @@ def _add_numeric_operations(cls): """ def groupby_function(name, alias, npfunc, - numeric_only=True, _convert=False, + numeric_only=True, min_count=-1): _local_template = """ @@ -1312,17 +1321,30 @@ def f(self, **kwargs): kwargs['min_count'] = min_count self._set_group_selection() + + # try a cython aggregation if we can try: return self._cython_agg_general( alias, alt=npfunc, **kwargs) except AssertionError as e: raise SpecificationError(str(e)) except Exception: - result = self.aggregate( - lambda x: npfunc(x, axis=self.axis)) - if _convert: - result = result._convert(datetime=True) - return result + pass + + # apply a non-cython aggregation + result = self.aggregate( + lambda x: npfunc(x, axis=self.axis)) + + # coerce the resulting columns if we can + if isinstance(result, DataFrame): + for col in result.columns: + result[col] = self._try_cast( + result[col], self.obj[col]) + else: + result = self._try_cast( + result, self.obj) + + return result set_function_name(f, name, cls) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 010047a8be4ed..38478be5a8e07 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -19,7 +19,7 @@ from pandas.core.dtypes.common import ( ensure_float64, ensure_int64, ensure_int_or_float, ensure_object, ensure_platform_int, is_bool_dtype, is_categorical_dtype, is_complex_dtype, - is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype, + is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype, is_sparse, is_timedelta64_dtype, needs_i8_conversion) from pandas.core.dtypes.missing import _maybe_fill, isna @@ -451,9 +451,9 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, # categoricals are only 1d, so we # are not setup for dim transforming - if is_categorical_dtype(values): + if is_categorical_dtype(values) or is_sparse(values): raise NotImplementedError( - 
"categoricals are not support in cython ops ATM") + "{} are not support in cython ops".format(values.dtype)) elif is_datetime64_any_dtype(values): if how in ['add', 'prod', 'cumsum', 'cumprod']: raise NotImplementedError( diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 4cc6c86417b3b..dfb5c458b0d77 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -600,7 +600,8 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, values = self.get_values(dtype=dtype) # _astype_nansafe works fine with 1-d only - values = astype_nansafe(values.ravel(), dtype, copy=True) + values = astype_nansafe( + values.ravel(), dtype, copy=True, **kwargs) # TODO(extension) # should we make this attribute? @@ -1767,6 +1768,27 @@ def _slice(self, slicer): return self.values[slicer] + def _try_cast_result(self, result, dtype=None): + """ + if we have an operation that operates on for example floats + we want to try to cast back to our EA here if possible + + result could be a 2-D numpy array, e.g. the result of + a numeric operation; but it must be shape (1, X) because + we by-definition operate on the ExtensionBlocks one-by-one + + result could also be an EA Array itself, in which case it + is already a 1-D array + """ + try: + + result = self._holder._from_sequence( + np.asarray(result).ravel(), dtype=dtype) + except Exception: + pass + + return result + def formatting_values(self): # Deprecating the ability to override _formatting_values. 
# Do the warning here, it's only user in pandas, since we diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 2616f0aa97d0d..8e1609c1364fd 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -666,7 +666,10 @@ def sanitize_array(data, index, dtype=None, copy=False, data = np.array(data, dtype=dtype, copy=False) subarr = np.array(data, dtype=object, copy=copy) - if is_object_dtype(subarr.dtype) and dtype != 'object': + if (not (is_extension_array_dtype(subarr.dtype) or + is_extension_array_dtype(dtype)) and + is_object_dtype(subarr.dtype) and + not is_object_dtype(dtype)): inferred = lib.infer_dtype(subarr, skipna=False) if inferred == 'period': try: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 7923e463c7719..24a28bf0005cb 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -72,11 +72,12 @@ def _f(*args, **kwargs): class bottleneck_switch: - def __init__(self, **kwargs): + def __init__(self, name=None, **kwargs): + self.name = name self.kwargs = kwargs def __call__(self, alt): - bn_name = alt.__name__ + bn_name = self.name or alt.__name__ try: bn_func = getattr(bn, bn_name) @@ -804,7 +805,8 @@ def nansem(values, axis=None, skipna=True, ddof=1, mask=None): def _nanminmax(meth, fill_value_typ): - @bottleneck_switch() + + @bottleneck_switch(name='nan' + meth) def reduction(values, axis=None, skipna=True, mask=None): values, mask, dtype, dtype_max, fill_value = _get_values( @@ -824,7 +826,6 @@ def reduction(values, axis=None, skipna=True, mask=None): result = _wrap_results(result, dtype, fill_value) return _maybe_null_out(result, axis, mask, values.shape) - reduction.__name__ = 'nan' + meth return reduction diff --git a/pandas/core/series.py b/pandas/core/series.py index c4a449154860f..2432d801fe07e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5,6 +5,7 @@ from io import StringIO from shutil import get_terminal_size from 
textwrap import dedent +from typing import Optional, Tuple import warnings import numpy as np @@ -762,12 +763,28 @@ def __array__(self, dtype=None): dtype = 'M8[ns]' return np.asarray(self.array, dtype) - def __array_wrap__(self, result, context=None): + def __array_wrap__(self, result: np.ndarray, + context: Optional[Tuple] = None) -> 'Series': """ - Gets called after a ufunc. + We are called post ufunc; reconstruct the original object and dtypes. + + Parameters + ---------- + result : np.ndarray + context : tuple, optional + + Returns + ------- + Series """ - return self._constructor(result, index=self.index, - copy=False).__finalize__(self) + result = self._constructor(result, index=self.index, + copy=False) + + # we try to cast extension array types back to the original + if is_extension_array_dtype(self): + result = result.astype(self.dtype, copy=False) + + return result.__finalize__(self) def __array_prepare__(self, result, context=None): """ diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 67ecbcbea67f9..778fff249817d 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -284,20 +284,26 @@ def _unpickle_sparse_frame_compat(self, state): def to_dense(self): return SparseFrameAccessor(self).to_dense() - def _apply_columns(self, func): + def _apply_columns(self, func, *args, **kwargs): """ Get new SparseDataFrame applying func to each columns """ - new_data = {col: func(series) + new_data = {col: func(series, *args, **kwargs) for col, series in self.items()} return self._constructor( data=new_data, index=self.index, columns=self.columns, default_fill_value=self.default_fill_value).__finalize__(self) - def astype(self, dtype): - return self._apply_columns(lambda x: x.astype(dtype)) + def astype(self, dtype, **kwargs): + + def f(x, dtype, **kwargs): + if isinstance(dtype, (dict, Series)): + dtype = dtype[x.name] + return x.astype(dtype, **kwargs) + + return self._apply_columns(f, dtype=dtype, **kwargs) def 
copy(self, deep=True): """ diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 231b5a92dbb3a..69259c66d61dd 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -472,6 +472,7 @@ def test_astype(self): # float -> float arr = SparseArray([None, None, 0, 2]) result = arr.astype("Sparse[float32]") + expected = SparseArray([None, None, 0, 2], dtype=np.dtype('float32')) tm.assert_sp_array_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 3d9bfcd126377..355da1151d878 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -12,7 +12,7 @@ from pandas import ( DataFrame, Index, MultiIndex, Series, Timestamp, date_range, isna) import pandas.core.nanops as nanops -from pandas.util import testing as tm +from pandas.util import _test_decorators as td, testing as tm @pytest.mark.parametrize("agg_func", ['any', 'all']) @@ -144,6 +144,7 @@ def test_arg_passthru(): index=Index([1, 2], name='group'), columns=['int', 'float', 'category_int', 'datetime', 'datetimetz', 'timedelta']) + for attr in ['mean', 'median']: f = getattr(df.groupby('group'), attr) result = f() @@ -459,35 +460,33 @@ def test_groupby_cumprod(): tm.assert_series_equal(actual, expected) -def test_ops_general(): - ops = [('mean', np.mean), - ('median', np.median), - ('std', np.std), - ('var', np.var), - ('sum', np.sum), - ('prod', np.prod), - ('min', np.min), - ('max', np.max), - ('first', lambda x: x.iloc[0]), - ('last', lambda x: x.iloc[-1]), - ('count', np.size), ] - try: - from scipy.stats import sem - except ImportError: - pass - else: - ops.append(('sem', sem)) +def scipy_sem(*args, **kwargs): + from scipy.stats import sem + return sem(*args, ddof=1, **kwargs) + + +@pytest.mark.parametrize( + 'op,targop', + [('mean', np.mean), + ('median', np.median), + ('std', np.std), + ('var', np.var), + ('sum', 
np.sum), + ('prod', np.prod), + ('min', np.min), + ('max', np.max), + ('first', lambda x: x.iloc[0]), + ('last', lambda x: x.iloc[-1]), + ('count', np.size), + pytest.param( + 'sem', scipy_sem, marks=td.skip_if_no_scipy)]) +def test_ops_general(op, targop): df = DataFrame(np.random.randn(1000)) labels = np.random.randint(0, 50, size=1000).astype(float) - for op, targop in ops: - result = getattr(df.groupby(labels), op)().astype(float) - expected = df.groupby(labels).agg(targop) - try: - tm.assert_frame_equal(result, expected) - except BaseException as exc: - exc.args += ('operation: %s' % op, ) - raise + result = getattr(df.groupby(labels), op)().astype(float) + expected = df.groupby(labels).agg(targop) + tm.assert_frame_equal(result, expected) def test_max_nan_bug(): diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 6a08a8d79b63e..b174fb0e0b6f9 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -282,18 +282,21 @@ def test_first_last_tz(data, expected_first, expected_last): ]) def test_first_last_tz_multi_column(method, ts, alpha): # GH 21603 + category_string = pd.Series(list('abc')).astype( + 'category') df = pd.DataFrame({'group': [1, 1, 2], - 'category_string': pd.Series(list('abc')).astype( - 'category'), + 'category_string': category_string, 'datetimetz': pd.date_range('20130101', periods=3, tz='US/Eastern')}) result = getattr(df.groupby('group'), method)() - expepcted = pd.DataFrame({'category_string': [alpha, 'c'], - 'datetimetz': [ts, - Timestamp('2013-01-03', - tz='US/Eastern')]}, - index=pd.Index([1, 2], name='group')) - assert_frame_equal(result, expepcted) + expected = pd.DataFrame( + {'category_string': pd.Categorical( + [alpha, 'c'], dtype=category_string.dtype), + 'datetimetz': [ts, + Timestamp('2013-01-03', + tz='US/Eastern')]}, + index=pd.Index([1, 2], name='group')) + assert_frame_equal(result, expected) def test_nth_multi_index_as_expected(): diff --git 
a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 5711174ef0c9f..830ba6062cc72 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -112,6 +112,12 @@ def test_resample_integerarray(): dtype="Int64") assert_series_equal(result, expected) + result = ts.resample('3T').mean() + expected = Series([1, 4, 7], + index=pd.date_range('1/1/2000', periods=3, freq='3T'), + dtype='Int64') + assert_series_equal(result, expected) + def test_resample_basic_grouper(series): s = series diff --git a/pandas/tests/sparse/frame/test_analytics.py b/pandas/tests/sparse/frame/test_analytics.py index ae97682f297ad..52fcf7c355cf2 100644 --- a/pandas/tests/sparse/frame/test_analytics.py +++ b/pandas/tests/sparse/frame/test_analytics.py @@ -1,7 +1,8 @@ import numpy as np import pytest -from pandas import DataFrame, SparseDataFrame, SparseSeries +from pandas import ( + DataFrame, Series, SparseDataFrame, SparseDtype, SparseSeries) from pandas.util import testing as tm @@ -39,3 +40,20 @@ def test_quantile_multi(): tm.assert_frame_equal(result, dense_expected) tm.assert_sp_frame_equal(result, sparse_expected) + + +@pytest.mark.parametrize( + 'data, dtype', + [([1, np.nan, 3], SparseDtype('float64', np.nan)), + ([1, 2, 3], SparseDtype('int'))]) +@pytest.mark.parametrize('func', [np.exp, np.sqrt], ids=str) +def test_ufunc(data, dtype, func): + # GH 23743 + # assert we preserve the incoming dtype on ufunc operation + df = DataFrame( + {'A': Series(data, dtype=dtype)}) + result = func(df) + expected = DataFrame( + {'A': Series(func(data), + dtype=SparseDtype('float64', dtype.fill_value))}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/sparse/series/test_analytics.py b/pandas/tests/sparse/series/test_analytics.py new file mode 100644 index 0000000000000..bf04f5b52a371 --- /dev/null +++ b/pandas/tests/sparse/series/test_analytics.py @@ -0,0 +1,20 @@ +import numpy as np 
+import pytest + +from pandas import Series, SparseDtype +from pandas.util import testing as tm + + +@pytest.mark.parametrize( + 'data, dtype', + [([1, np.nan, 3], SparseDtype('float64', np.nan)), + ([1, 2, 3], SparseDtype('int'))]) +@pytest.mark.parametrize('func', [np.exp, np.sqrt], ids=str) +def test_ufunc(data, dtype, func): + # GH 23743 + # assert we preserve the incoming dtype on ufunc operation + s = Series(data, dtype=dtype) + result = func(s) + expected = Series(func(data), + dtype=SparseDtype('float64', dtype.fill_value)) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/sparse/test_groupby.py b/pandas/tests/sparse/test_groupby.py index 531a4360c78a2..bf6055bc12725 100644 --- a/pandas/tests/sparse/test_groupby.py +++ b/pandas/tests/sparse/test_groupby.py @@ -29,11 +29,10 @@ def test_first_last_nth(self): sparse_grouped_last = sparse_grouped.last() sparse_grouped_nth = sparse_grouped.nth(1) - dense_grouped_first = dense_grouped.first().to_sparse() - dense_grouped_last = dense_grouped.last().to_sparse() - dense_grouped_nth = dense_grouped.nth(1).to_sparse() + dense_grouped_first = pd.DataFrame(dense_grouped.first().to_sparse()) + dense_grouped_last = pd.DataFrame(dense_grouped.last().to_sparse()) + dense_grouped_nth = pd.DataFrame(dense_grouped.nth(1).to_sparse()) - # TODO: shouldn't these all be spares or not? 
tm.assert_frame_equal(sparse_grouped_first, dense_grouped_first) tm.assert_frame_equal(sparse_grouped_last, @@ -69,5 +68,6 @@ def test_groupby_includes_fill_value(fill_value): 'b': [fill_value, 1, fill_value, fill_value]}) sdf = df.to_sparse(fill_value=fill_value) result = sdf.groupby('a').sum() - expected = df.groupby('a').sum().to_sparse(fill_value=fill_value) + expected = pd.DataFrame(df.groupby('a').sum().to_sparse( + fill_value=fill_value)) tm.assert_frame_equal(result, expected, check_index_type=False) diff --git a/pandas/tests/sparse/test_pivot.py b/pandas/tests/sparse/test_pivot.py index 114e7b4bacd94..f545ce0310a2a 100644 --- a/pandas/tests/sparse/test_pivot.py +++ b/pandas/tests/sparse/test_pivot.py @@ -2,6 +2,7 @@ import pytest import pandas as pd +from pandas import _np_version_under1p17 import pandas.util.testing as tm @@ -47,10 +48,24 @@ def test_pivot_table(self): # values='E', aggfunc='sum') # tm.assert_frame_equal(res_sparse, res_dense) - def test_pivot_table_multi(self): - res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', - values=['D', 'E']) - res_dense = pd.pivot_table(self.dense, index='A', columns='B', - values=['D', 'E']) + @pytest.mark.parametrize( + 'func', + ['mean', + 'std', + 'var', + 'sem', + pytest.param('median', marks=pytest.mark.xfail( + not _np_version_under1p17, reason="fails on numpy > 1.16")), + 'first', + 'last']) + @pytest.mark.parametrize('dropna', [True, False]) + def test_pivot_table_multi(self, func, dropna): + + res_sparse = pd.pivot_table( + self.sparse, index='A', columns='B', + values=['D', 'E'], aggfunc=func, dropna=dropna) + res_dense = pd.pivot_table( + self.dense, index='A', columns='B', + values=['D', 'E'], aggfunc=func, dropna=dropna) res_dense = res_dense.apply(lambda x: x.astype("Sparse[float64]")) tm.assert_frame_equal(res_sparse, res_dense)