Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ Sparse

- Bug in ``SparseSeries`` raising ``AttributeError`` when a dictionary is passed in as data (:issue:`16905`)
- Bug in :func:`SparseDataFrame.fillna` not filling all NaNs when frame was instantiated from SciPy sparse matrix (:issue:`16112`)

- Bug in :func:`SparseSeries.unstack` and :func:`SparseDataFrame.stack` (:issue:`16614`, :issue:`15045`)

Reshaping
^^^^^^^^^
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ def maybe_to_categorical(array):
""" coerce to a categorical if a series is given """
if isinstance(array, (ABCSeries, ABCCategoricalIndex)):
return array._values
elif isinstance(array, np.ndarray):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how is this hit?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in NonConsolidatingBlock._unstack() calling .make_block_same_class(), making a CategoricalBlock from unstacked values (an ndarray). Otherwise fails frame.test_reshape.TestDataFrameReshape.test_unstack_preserve_dtypes.
Since the function is named maybe_to_categorical and accepts an argument named array, the change seems to make perfect sense.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a comment to the doc-string that this is only an internal method.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

May I prefix it with an underscore? It's used in a single place only.

return Categorical(array)
return array


Expand Down
93 changes: 93 additions & 0 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1484,6 +1484,35 @@ def equals(self, other):
return False
return array_equivalent(self.values, other.values)

def _unstack(self, unstacker_func, new_columns):
    """Unstack the values of this block into a list of new blocks.

    Parameters
    ----------
    unstacker_func : callable
        Partially applied unstacker; it is called with this block's
        transposed values.
    new_columns : Index
        All columns of the unstacked BlockManager.

    Returns
    -------
    blocks : list of Block
        New blocks of unstacked values.
    mask : array_like of bool
        The mask of columns of `blocks` we should keep.
    """
    unstacker = unstacker_func(self.values.T)

    # Positions of this block's unstacked columns within `new_columns`.
    placement = new_columns.get_indexer(unstacker.get_new_columns())
    unstacked, value_mask = unstacker.get_new_values()

    # Collapse the value mask along axis 0 and use it to select the
    # matching rows of the transposed values and their placements.
    keep = value_mask.any(0)
    block = make_block(unstacked.T[keep], placement=placement[keep])
    return [block], keep

def quantile(self, qs, interpolation='linear', axis=0, mgr=None):
"""
compute the quantiles of the
Expand Down Expand Up @@ -1712,6 +1741,38 @@ def _slice(self, slicer):
def _try_cast_result(self, result, dtype=None):
return result

def _unstack(self, unstacker_func, new_columns):
"""Return a list of unstacked blocks of self

Parameters
----------
unstacker_func : callable
Partially applied unstacker.
new_columns : Index
All columns of the unstacked BlockManager.

Returns
-------
blocks : list of Block
New blocks of unstacked values.
mask : array_like of bool
The mask of columns of `blocks` we should keep.
"""
# NonConsolidatable blocks can have a single item only, so we return
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add doc-string

# one block per item
unstacker = unstacker_func(self.values.T)
new_items = unstacker.get_new_columns()
new_placement = new_columns.get_indexer(new_items)
new_values, mask = unstacker.get_new_values()

mask = mask.any(0)
new_values = new_values.T[mask]
new_placement = new_placement[mask]

blocks = [self.make_block_same_class(vals, [place])
for vals, place in zip(new_values, new_placement)]
return blocks, mask


class NumericBlock(Block):
__slots__ = ()
Expand Down Expand Up @@ -4167,6 +4228,38 @@ def canonicalize(block):
return all(block.equals(oblock)
for block, oblock in zip(self_blocks, other_blocks))

def unstack(self, unstacker_func):
"""Return a blockmanager with all blocks unstacked.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

change unstacker_t as above


Parameters
----------
unstacker_func : callable
A (partially-applied) ``pd.core.reshape._Unstacker`` class.

Returns
-------
unstacked : BlockManager
"""
dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items)
new_columns = dummy.get_new_columns()
new_index = dummy.get_new_index()
new_blocks = []
columns_mask = []

for blk in self.blocks:
blocks, mask = blk._unstack(
partial(unstacker_func,
value_columns=self.items[blk.mgr_locs.indexer]),
new_columns)

new_blocks.extend(blocks)
columns_mask.extend(mask)

new_columns = new_columns[columns_mask]

bm = BlockManager(new_blocks, [new_columns, new_index])
return bm


class SingleBlockManager(BlockManager):
""" manage a single block with """
Expand Down
49 changes: 17 additions & 32 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# pylint: disable=W0703,W0622,W0613,W0201
from pandas.compat import range, text_type, zip
from pandas import compat
from functools import partial
import itertools
import re

Expand All @@ -10,7 +11,7 @@
from pandas.core.dtypes.common import (
_ensure_platform_int,
is_list_like, is_bool_dtype,
needs_i8_conversion)
needs_i8_conversion, is_sparse)
from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.missing import notna
import pandas.core.dtypes.concat as _concat
Expand Down Expand Up @@ -75,10 +76,15 @@ def __init__(self, values, index, level=-1, value_columns=None,
fill_value=None):

self.is_categorical = None
self.is_sparse = is_sparse(values)
if values.ndim == 1:
if isinstance(values, Categorical):
self.is_categorical = values
values = np.array(values)
elif self.is_sparse:
# XXX: Makes SparseArray *dense*, but it's supposedly
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this a TODO? or a comment?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A comment with a mild TODO hint for whoever eventually refactors the whole thing, so that they take the note into consideration.

# a single column at a time, so it's "doable"
values = values.values
values = values[:, np.newaxis]
self.values = values
self.value_columns = value_columns
Expand Down Expand Up @@ -177,7 +183,8 @@ def get_result(self):
ordered=ordered)
for i in range(values.shape[-1])]

return DataFrame(values, index=index, columns=columns)
klass = SparseDataFrame if self.is_sparse else DataFrame
return klass(values, index=index, columns=columns)

def get_new_values(self):
values = self.values
Expand Down Expand Up @@ -469,36 +476,12 @@ def unstack(obj, level, fill_value=None):


def _unstack_frame(obj, level, fill_value=None):
from pandas.core.internals import BlockManager, make_block

if obj._is_mixed_type:
unstacker = _Unstacker(np.empty(obj.shape, dtype=bool), # dummy
obj.index, level=level,
value_columns=obj.columns)
new_columns = unstacker.get_new_columns()
new_index = unstacker.get_new_index()
new_axes = [new_columns, new_index]

new_blocks = []
mask_blocks = []
for blk in obj._data.blocks:
blk_items = obj._data.items[blk.mgr_locs.indexer]
bunstacker = _Unstacker(blk.values.T, obj.index, level=level,
value_columns=blk_items,
fill_value=fill_value)
new_items = bunstacker.get_new_columns()
new_placement = new_columns.get_indexer(new_items)
new_values, mask = bunstacker.get_new_values()

mblk = make_block(mask.T, placement=new_placement)
mask_blocks.append(mblk)

newb = make_block(new_values.T, placement=new_placement)
new_blocks.append(newb)

result = DataFrame(BlockManager(new_blocks, new_axes))
mask_frame = DataFrame(BlockManager(mask_blocks, new_axes))
return result.loc[:, mask_frame.sum(0) > 0]
unstacker = partial(_Unstacker, index=obj.index,
level=level, fill_value=fill_value)
blocks = obj._data.unstack(unstacker)
klass = type(obj)
return klass(blocks)
else:
unstacker = _Unstacker(obj.values, obj.index, level=level,
value_columns=obj.columns,
Expand Down Expand Up @@ -559,7 +542,9 @@ def factorize(index):
mask = notna(new_values)
new_values = new_values[mask]
new_index = new_index[mask]
return Series(new_values, index=new_index)

klass = type(frame)._constructor_sliced
return klass(new_values, index=new_index)


def stack_multiple(frame, level, dropna=True):
Expand Down
38 changes: 38 additions & 0 deletions pandas/tests/sparse/test_reshape.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import pytest
import numpy as np

import pandas as pd
import pandas.util.testing as tm


@pytest.fixture
def sparse_df():
    """Return a SparseDataFrame with ones on the diagonal ("eye")."""
    diagonal = {0: {0: 1}, 1: {1: 1}, 2: {2: 1}}
    return pd.SparseDataFrame(diagonal)


@pytest.fixture
def multi_index3():
    """Return a MultiIndex built from the tuples (0, 0), (1, 1), (2, 2)."""
    pairs = [(i, i) for i in range(3)]
    return pd.MultiIndex.from_tuples(pairs)


def test_sparse_frame_stack(sparse_df, multi_index3):
    """Stacking the sparse frame must equal a SparseSeries of three ones."""
    expected = pd.SparseSeries(np.ones(3), index=multi_index3)
    result = sparse_df.stack()
    tm.assert_sp_series_equal(result, expected)


def test_sparse_frame_unstack(sparse_df):
    """Unstacking a sparse frame must give the same values as a dense one."""
    index = pd.MultiIndex.from_tuples([(0, 0), (1, 0), (1, 2)])
    sparse_df.index = index
    dense_values = np.array([[1, np.nan, np.nan],
                             [np.nan, 1, np.nan],
                             [np.nan, np.nan, 1]])
    dense_result = pd.DataFrame(dense_values, index=index).unstack()
    sparse_result = sparse_df.unstack()

    # Only the underlying value arrays are compared here.
    tm.assert_numpy_array_equal(dense_result.values, sparse_result.values)


def test_sparse_series_unstack(sparse_df, multi_index3):
    """Unstacking a SparseSeries of ones must equal the sparse frame."""
    series = pd.SparseSeries(np.ones(3), index=multi_index3)
    tm.assert_sp_frame_equal(series.unstack(), sparse_df)