Skip to content

Commit 56ddfb8

Browse files
committed
Merge pull request #762 from shoyer/dataframe-cateogrical
Fix converting a dataframe with categorical column and a multiindex
2 parents a897454 + 2a31c1a commit 56ddfb8

File tree

3 files changed

+29
-3
lines changed

3 files changed

+29
-3
lines changed

doc/whats-new.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,16 @@ Bug fixes
2929
- Single dimension variables no longer transpose as part of a broader
3030
``.transpose``. This behavior was causing ``pandas.PeriodIndex`` dimensions
3131
to lose their type (:issue:`749`)
32-
- `~xarray.Dataset` labels remain as their native type on ``.to_dataset``.
32+
- :py:class:`~xarray.Dataset` labels remain as their native type on ``.to_dataset``.
3333
Previously they were coerced to strings (:issue:`745`)
3434
- Fixed a bug where replacing a ``DataArray`` index coordinate would improperly
3535
align the coordinate (:issue:`725`).
3636
- ``DataArray.reindex_like`` now maintains the dtype of complex numbers when
3737
reindexing leads to NaN values (:issue:`738`).
3838
- ``Dataset.rename`` and ``DataArray.rename`` support the old and new names
3939
being the same (:issue:`724`).
40+
- Fix :py:meth:`~xarray.Dataset.from_dataframe` for DataFrames with a Categorical
41+
column and a MultiIndex index (:issue:`737`).
4042
- Fixes to ensure xarray works properly after the upcoming pandas v0.18 and
4143
NumPy v1.11 releases.
4244

xarray/core/dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1905,7 +1905,7 @@ def from_dataframe(cls, dataframe):
19051905
shape = -1
19061906

19071907
for name, series in iteritems(dataframe):
1908-
data = series.values.reshape(shape)
1908+
data = np.asarray(series).reshape(shape)
19091909
obj[name] = (dims, data)
19101910
return obj
19111911

xarray/test/test_dataset.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1722,7 +1722,6 @@ def test_to_and_from_dataframe(self):
17221722
expected = Dataset({'A': DataArray([], dims=('index',))})
17231723
self.assertDatasetIdentical(expected, actual)
17241724

1725-
17261725
# regression test for GH278
17271726
# use int64 to ensure consistent results for the pandas .equals method
17281727
# on windows (which requires the same dtype)
@@ -1741,12 +1740,37 @@ def test_to_and_from_dataframe(self):
17411740
expected = pd.DataFrame([[]], index=idx)
17421741
assert expected.equals(actual), (expected, actual)
17431742

1743+
def test_from_dataframe_non_unique_columns(self):
17441744
# regression test for GH449
17451745
df = pd.DataFrame(np.zeros((2, 2)))
17461746
df.columns = ['foo', 'foo']
17471747
with self.assertRaisesRegexp(ValueError, 'non-unique columns'):
17481748
Dataset.from_dataframe(df)
17491749

1750+
def test_convert_dataframe_with_many_types_and_multiindex(self):
1751+
# regression test for GH737
1752+
df = pd.DataFrame({'a': list('abc'),
1753+
'b': list(range(1, 4)),
1754+
'c': np.arange(3, 6).astype('u1'),
1755+
'd': np.arange(4.0, 7.0, dtype='float64'),
1756+
'e': [True, False, True],
1757+
'f': pd.Categorical(list('abc')),
1758+
'g': pd.date_range('20130101', periods=3),
1759+
'h': pd.date_range('20130101',
1760+
periods=3,
1761+
tz='US/Eastern')})
1762+
df.index = pd.MultiIndex.from_product([['a'], range(3)],
1763+
names=['one', 'two'])
1764+
roundtripped = Dataset.from_dataframe(df).to_dataframe()
1765+
# we can't do perfectly, but we should be at least as faithful as
1766+
# np.asarray
1767+
expected = df.apply(np.asarray)
1768+
if pd.__version__ < '0.17':
1769+
# datetime with timezone dtype is not consistent on old pandas
1770+
roundtripped = roundtripped.drop(['h'], axis=1)
1771+
expected = expected.drop(['h'], axis=1)
1772+
assert roundtripped.equals(expected)
1773+
17501774
def test_pickle(self):
17511775
data = create_test_data()
17521776
roundtripped = pickle.loads(pickle.dumps(data))

0 commit comments

Comments
 (0)