diff --git a/doc/source/release.rst b/doc/source/release.rst index 3d4ff0610f43f..58232e226e277 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -292,6 +292,7 @@ Bug Fixes - Bug in resample with extra bins when using an evenly divisible frequency (:issue:`4076`) - Bug in consistency of groupby aggregation when passing a custom function (:issue:`6715`) - Bug in resample when ``how=None`` resample freq is the same as the axis frequency (:issue:`5955`) +- Bug in downcasting inference with empty arrays (:issue:`6733`) pandas 0.13.1 ------------- diff --git a/pandas/core/common.py b/pandas/core/common.py index daeb43c7e76ac..b33ee6d66f901 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -1084,7 +1084,7 @@ def _possibly_downcast_to_dtype(result, dtype): or could be an astype of float64->float32 """ - if np.isscalar(result) or not len(result): + if np.isscalar(result): return result trans = lambda x: x @@ -1114,15 +1114,19 @@ def _possibly_downcast_to_dtype(result, dtype): try: - # don't allow upcasts here + # don't allow upcasts here (except if empty) if dtype.kind == result.dtype.kind: - if result.dtype.itemsize <= dtype.itemsize: + if result.dtype.itemsize <= dtype.itemsize and np.prod(result.shape): return result if issubclass(dtype.type, np.floating): return result.astype(dtype) elif dtype == np.bool_ or issubclass(dtype.type, np.integer): + # if we don't have any elements, just astype it + if not np.prod(result.shape): + return trans(result).astype(dtype) + # do a test on the first element, if it fails then we are done r = result.ravel() arr = np.array([r[0]]) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 59bfce8d9d636..7185b684a1e12 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -166,6 +166,12 @@ def test_downcast_conv(): result = com._possibly_downcast_to_dtype(arr,'infer') tm.assert_almost_equal(result, expected) + # empties + for dtype in [np.int32,np.float64,np.float32,np.bool_,np.int64,object]: + arr = np.array([],dtype=dtype) + result = com._possibly_downcast_to_dtype(arr,'int64') + tm.assert_almost_equal(result, np.array([],dtype=np.int64)) + assert result.dtype == np.int64 def test_array_equivalent(): assert array_equivalent(np.array([np.nan, np.nan]), @@ -182,10 +188,10 @@ def test_array_equivalent(): np.array([np.nan, 2, np.nan])) assert not array_equivalent(np.array(['a', 'b', 'c', 'd']), np.array(['e', 'e'])) assert array_equivalent(Float64Index([0, np.nan]), Float64Index([0, np.nan])) - assert not array_equivalent(Float64Index([0, np.nan]), Float64Index([1, np.nan])) - assert array_equivalent(DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan])) + assert not array_equivalent(Float64Index([0, np.nan]), Float64Index([1, np.nan])) + assert array_equivalent(DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan])) assert not array_equivalent(DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan])) - + def test_datetimeindex_from_empty_datetime64_array(): for unit in [ 'ms', 'us', 'ns' ]: idx = DatetimeIndex(np.array([], dtype='datetime64[%s]' % unit)) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 79eac770f547e..3ff40b1adf156 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2237,6 +2237,14 @@ def test_groupby_aggregation_mixed_dtype(self): result = g[['v1','v2']].mean() assert_frame_equal(result,expected) + + def test_groupby_dtype_inference_empty(self): + # GH 6733 + df = DataFrame({'x': [], 'range': np.arange(0)}) + result = df.groupby('x').first() + expected = DataFrame({'range' : Series([],index=Index([],name='x'),dtype='int64') }) + assert_frame_equal(result,expected,by_blocks=True) + def test_groupby_list_infer_array_like(self): result = self.df.groupby(list(self.df['A'])).mean() expected = self.df.groupby(self.df['A']).mean() @@ -3862,20 +3870,20 @@ def test_lexsort_indexer(self): result = _lexsort_indexer(keys, orders=True, na_position='last') expected = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) assert_equal(result, expected) - + # orders=True, na_position='first' result = _lexsort_indexer(keys, orders=True, na_position='first') expected = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) assert_equal(result, expected) - + # orders=False, na_position='last' result = _lexsort_indexer(keys, orders=False, na_position='last') - expected = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) + expected = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) assert_equal(result, expected) - + # orders=False, na_position='first' result = _lexsort_indexer(keys, orders=False, na_position='first') - expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) + expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) assert_equal(result, expected) def test_nargsort(self): @@ -3887,7 +3895,7 @@ def test_nargsort(self): try: # GH 2785; due to a regression in NumPy1.6.2 np.argsort(np.array([[1, 2], [1, 3], [1, 2]], dtype='i')) - np.argsort(items2, kind='mergesort') + np.argsort(items2, kind='mergesort') except TypeError as err: raise nose.SkipTest('requested sort not available for type') @@ -3898,7 +3906,7 @@ def test_nargsort(self): # because quick and merge sort fall over to insertion sort for small # arrays.""" - + # mergesort, ascending=True, na_position='last' result = _nargsort( items, kind='mergesort', ascending=True, na_position='last')