diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index e67ca3b199369..84f79a0c0685d 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -323,7 +323,24 @@ These changes allow pandas to handle sparse data with more dtypes, and for work s + 1 +- Sparse data structures now support ``astype`` to convert the internal ``dtype`` (:issue:`13900`) +.. ipython:: python + + s = pd.SparseSeries([1., 0., 2., 0.], fill_value=0) + s + s.astype(np.int64) + +``astype`` fails if the data contains values which cannot be converted to the specified ``dtype``. +Note that this limitation also applies to ``fill_value``, whose default is ``np.nan``. + +.. code-block:: ipython + + In [7]: pd.SparseSeries([1., np.nan, 2., np.nan], fill_value=np.nan).astype(np.int64) + Out[7]: + ValueError: unable to coerce current fill_value nan to int64 dtype + +- Subclassed ``SparseDataFrame`` and ``SparseSeries`` now preserve class types when slicing or transposing. (:issue:`13787`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) - Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) @@ -411,7 +428,7 @@ API changes - ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`) - ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`) - ``pd.read_hdf`` will now raise a ``ValueError`` instead of ``KeyError``, if a mode other than ``r``, ``r+`` and ``a`` is supplied. (:issue:`13623`) -- Subclassed ``SparseDataFrame`` and ``SparseSeries`` now preserve class types when slicing or transposing. 
(:issue:`13787`) + diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 8e77486457546..83fba7a0ce8b5 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2504,6 +2504,14 @@ def sp_index(self): def kind(self): return self.values.kind + def _astype(self, dtype, copy=False, raise_on_error=True, values=None, + klass=None, mgr=None, **kwargs): + if values is None: + values = self.values + values = values.astype(dtype, copy=copy) + return self.make_block_same_class(values=values, + placement=self.mgr_locs) + def __len__(self): try: return self.sp_index.length @@ -2521,7 +2529,7 @@ def make_block_same_class(self, values, placement, sparse_index=None, copy=False, fastpath=True, **kwargs): """ return a new block """ if dtype is None: - dtype = self.dtype + dtype = values.dtype if fill_value is None and not isinstance(values, SparseArray): fill_value = self.values.fill_value diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index 8aebb19d5b93e..e22a62ee7f917 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -18,8 +18,9 @@ from pandas.types.common import (is_float, is_integer, is_integer_dtype, _ensure_platform_int, is_list_like, - is_scalar) -from pandas.types.cast import _possibly_convert_platform + is_scalar, is_dtype_equal) +from pandas.types.cast import (_possibly_convert_platform, _maybe_promote, + _astype_nansafe) from pandas.types.missing import isnull, notnull from pandas._sparse import SparseIndex, BlockIndex, IntIndex @@ -236,7 +237,7 @@ def _simple_new(cls, data, sp_index, fill_value): raise ValueError('sp_index must be a SparseIndex') result.sp_index = sp_index - result.fill_value = fill_value + result._fill_value = fill_value return result @property @@ -285,7 +286,7 @@ def __array_finalize__(self, obj): to pass on the index. 
""" self.sp_index = getattr(obj, 'sp_index', None) - self.fill_value = getattr(obj, 'fill_value', None) + self._fill_value = getattr(obj, 'fill_value', None) def __reduce__(self): """Necessary for making this object picklable""" @@ -301,7 +302,7 @@ def __setstate__(self, state): fill_value, sp_index = own_state[:2] self.sp_index = sp_index - self.fill_value = fill_value + self._fill_value = fill_value def __len__(self): try: @@ -344,6 +345,22 @@ def sp_values(self): # caching not an option, leaks memory return self.view(np.ndarray) + @property + def fill_value(self): + return self._fill_value + + @fill_value.setter + def fill_value(self, value): + if not is_scalar(value): + raise ValueError('fill_value must be a scalar') + # if the specified value triggers type promotion, raise ValueError + new_dtype, fill_value = _maybe_promote(self.dtype, value) + if is_dtype_equal(self.dtype, new_dtype): + self._fill_value = fill_value + else: + msg = 'unable to set fill_value {0} to {1} dtype' + raise ValueError(msg.format(value, self.dtype)) + def get_values(self, fill=None): """ return a dense representation """ return self.to_dense(fill=fill) @@ -479,19 +496,16 @@ def __setslice__(self, i, j, value): raise TypeError("SparseArray does not support item assignment via " "slices") - def astype(self, dtype=None): - """ - - """ + def astype(self, dtype=None, copy=True): dtype = np.dtype(dtype) - if dtype is not None and dtype not in (np.float_, float): - raise TypeError('Can only support floating point data for now') - - if self.dtype == dtype: - return self.copy() - else: - return self._simple_new(self.sp_values.astype(dtype), - self.sp_index, float(self.fill_value)) + sp_values = _astype_nansafe(self.sp_values, dtype, copy=copy) + try: + fill_value = dtype.type(self.fill_value) + except ValueError: + msg = 'unable to coerce current fill_value {0} to {1} dtype' + raise ValueError(msg.format(self.fill_value, dtype)) + return self._simple_new(sp_values, self.sp_index, + 
fill_value=fill_value) def copy(self, deep=True): """ diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index b6a1e1e48c5c4..f382a4b869a3e 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -235,8 +235,19 @@ def to_dense(self): data = dict((k, v.to_dense()) for k, v in compat.iteritems(self)) return DataFrame(data, index=self.index, columns=self.columns) + def _apply_columns(self, func): + """ get new SparseDataFrame applying func to each columns """ + + new_data = {} + for col, series in compat.iteritems(self): + new_data[col] = func(series) + + return self._constructor( + data=new_data, index=self.index, columns=self.columns, + default_fill_value=self.default_fill_value).__finalize__(self) + def astype(self, dtype): - raise NotImplementedError + return self._apply_columns(lambda x: x.astype(dtype)) def copy(self, deep=True): """ @@ -499,13 +510,7 @@ def _combine_match_columns(self, other, func, level=None, fill_value=None): default_fill_value=self.default_fill_value).__finalize__(self) def _combine_const(self, other, func): - new_data = {} - for col, series in compat.iteritems(self): - new_data[col] = func(series, other) - - return self._constructor( - data=new_data, index=self.index, columns=self.columns, - default_fill_value=self.default_fill_value).__finalize__(self) + return self._apply_columns(lambda x: func(x, other)) def _reindex_index(self, index, method, copy, level, fill_value=np.nan, limit=None, takeable=False): diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py index 2f12b9fba1842..70cda5acc3f4c 100644 --- a/pandas/sparse/tests/test_array.py +++ b/pandas/sparse/tests/test_array.py @@ -324,7 +324,68 @@ def test_astype(self): res.sp_values[:3] = 27 self.assertFalse((self.arr.sp_values[:3] == 27).any()) - assertRaisesRegexp(TypeError, "floating point", self.arr.astype, 'i8') + msg = "unable to coerce current fill_value nan to int64 dtype" + with tm.assertRaisesRegexp(ValueError, msg): + 
self.arr.astype('i8') + + arr = SparseArray([0, np.nan, 0, 1]) + with tm.assertRaisesRegexp(ValueError, msg): + arr.astype('i8') + + arr = SparseArray([0, np.nan, 0, 1], fill_value=0) + msg = "Cannot convert NA to integer" + with tm.assertRaisesRegexp(ValueError, msg): + arr.astype('i8') + + def test_astype_all(self): + vals = np.array([1, 2, 3]) + arr = SparseArray(vals, fill_value=1) + + types = [np.float64, np.float32, np.int64, + np.int32, np.int16, np.int8] + for typ in types: + res = arr.astype(typ) + self.assertEqual(res.dtype, typ) + self.assertEqual(res.sp_values.dtype, typ) + + tm.assert_numpy_array_equal(res.values, vals.astype(typ)) + + def test_set_fill_value(self): + arr = SparseArray([1., np.nan, 2.], fill_value=np.nan) + arr.fill_value = 2 + self.assertEqual(arr.fill_value, 2) + + arr = SparseArray([1, 0, 2], fill_value=0, dtype=np.int64) + arr.fill_value = 2 + self.assertEqual(arr.fill_value, 2) + + # coerces to int + msg = "unable to set fill_value 3\\.1 to int64 dtype" + with tm.assertRaisesRegexp(ValueError, msg): + arr.fill_value = 3.1 + + msg = "unable to set fill_value nan to int64 dtype" + with tm.assertRaisesRegexp(ValueError, msg): + arr.fill_value = np.nan + + arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool) + arr.fill_value = True + self.assertTrue(arr.fill_value) + + # coerces to bool + msg = "unable to set fill_value 0 to bool dtype" + with tm.assertRaisesRegexp(ValueError, msg): + arr.fill_value = 0 + + msg = "unable to set fill_value nan to bool dtype" + with tm.assertRaisesRegexp(ValueError, msg): + arr.fill_value = np.nan + + # invalid + msg = "fill_value must be a scalar" + for val in [[1, 2, 3], np.array([1, 2]), (1, 2, 3)]: + with tm.assertRaisesRegexp(ValueError, msg): + arr.fill_value = val def test_copy_shallow(self): arr2 = self.arr.copy(deep=False) diff --git a/pandas/sparse/tests/test_frame.py b/pandas/sparse/tests/test_frame.py index 9514f9322f68e..67b108c5dc648 100644 --- 
a/pandas/sparse/tests/test_frame.py +++ b/pandas/sparse/tests/test_frame.py @@ -15,7 +15,7 @@ import pandas.sparse.frame as spf from pandas._sparse import BlockIndex, IntIndex -from pandas.sparse.api import SparseSeries, SparseDataFrame +from pandas.sparse.api import SparseSeries, SparseDataFrame, SparseArray from pandas.tests.frame.test_misc_api import SharedWithSparse @@ -588,7 +588,59 @@ def test_applymap(self): tm.assertIsInstance(result, SparseDataFrame) def test_astype(self): - self.assertRaises(Exception, self.frame.astype, np.int64) + sparse = pd.SparseDataFrame({'A': SparseArray([1, 2, 3, 4], + dtype=np.int64), + 'B': SparseArray([4, 5, 6, 7], + dtype=np.int64)}) + self.assertEqual(sparse['A'].dtype, np.int64) + self.assertEqual(sparse['B'].dtype, np.int64) + + res = sparse.astype(np.float64) + exp = pd.SparseDataFrame({'A': SparseArray([1., 2., 3., 4.]), + 'B': SparseArray([4., 5., 6., 7.])}, + default_fill_value=np.nan) + tm.assert_sp_frame_equal(res, exp) + self.assertEqual(res['A'].dtype, np.float64) + self.assertEqual(res['B'].dtype, np.float64) + + sparse = pd.SparseDataFrame({'A': SparseArray([0, 2, 0, 4], + dtype=np.int64), + 'B': SparseArray([0, 5, 0, 7], + dtype=np.int64)}, + default_fill_value=0) + self.assertEqual(sparse['A'].dtype, np.int64) + self.assertEqual(sparse['B'].dtype, np.int64) + + res = sparse.astype(np.float64) + exp = pd.SparseDataFrame({'A': SparseArray([0., 2., 0., 4.]), + 'B': SparseArray([0., 5., 0., 7.])}, + default_fill_value=0.) 
+ tm.assert_sp_frame_equal(res, exp) + self.assertEqual(res['A'].dtype, np.float64) + self.assertEqual(res['B'].dtype, np.float64) + + def test_astype_bool(self): + sparse = pd.SparseDataFrame({'A': SparseArray([0, 2, 0, 4], + fill_value=0, + dtype=np.int64), + 'B': SparseArray([0, 5, 0, 7], + fill_value=0, + dtype=np.int64)}, + default_fill_value=0) + self.assertEqual(sparse['A'].dtype, np.int64) + self.assertEqual(sparse['B'].dtype, np.int64) + + res = sparse.astype(bool) + exp = pd.SparseDataFrame({'A': SparseArray([False, True, False, True], + dtype=np.bool, + fill_value=False), + 'B': SparseArray([False, True, False, True], + dtype=np.bool, + fill_value=False)}, + default_fill_value=False) + tm.assert_sp_frame_equal(res, exp) + self.assertEqual(res['A'].dtype, np.bool) + self.assertEqual(res['B'].dtype, np.bool) def test_fillna(self): df = self.zframe.reindex(lrange(5)) diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py index 9c792b4171b49..95361a8899c46 100644 --- a/pandas/sparse/tests/test_series.py +++ b/pandas/sparse/tests/test_series.py @@ -797,7 +797,8 @@ def test_fill_value_corner(self): cop2 = self.zbseries.copy() cop2.fill_value = 1 result = cop2 / cop - self.assertEqual(result.fill_value, np.inf) + # 1 / 0 is inf + self.assertTrue(np.isinf(result.fill_value)) def test_fill_value_when_combine_const(self): # GH12723