From 6a10fa824944d4e067b7c13ce984470ba5cf1f77 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Thu, 11 Jan 2018 09:45:10 +0700 Subject: [PATCH 1/9] BUG: assign doesn't cast SparseDataFrame to DataFrame The problem here is that a SparseDataFrame that calls assign should cast to a DataFrame mainly because SparseDataFrames are a special case. --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/frame.py | 4 +++- pandas/tests/frame/test_mutate_columns.py | 7 +++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 92eeed89ada2a..451135a539e2a 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -448,6 +448,7 @@ Reshaping - Bug in :func:`cut` which fails when using readonly arrays (:issue:`18773`) - Bug in :func:`Dataframe.pivot_table` which fails when the ``aggfunc`` arg is of type string. The behavior is now consistent with other methods like ``agg`` and ``apply`` (:issue:`18713`) - Bug in :func:`DataFrame.merge` in which merging using ``Index`` objects as vectors raised an Exception (:issue:`19038`) +- Bug in :func:`DataFrame.assign` which doesn't cast ``SparseDataFrame`` as ``DataFrame``. (:issue:`19163`) Numeric ^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a8c4053850548..326a551cfee24 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2713,7 +2713,9 @@ def assign(self, **kwargs): 8 9 0.549296 2.197225 9 10 -0.758542 2.302585 """ - data = self.copy() + + # See GH19163 + data = self.copy().to_dense() # do all calculations first... 
results = OrderedDict() diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 9acdf2f17d86a..f1d0284b5eeeb 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -55,6 +55,13 @@ def test_assign(self): result = df.assign(A=lambda x: x.A + x.B) assert_frame_equal(result, expected) + # SparseDataFrame + # See GH 19163 + result = df.to_sparse(fill_value=False).assign(newcol=False) + expected = df.assign(newcol=False) + assert type(result) is DataFrame + assert_frame_equal(expected, result) + def test_assign_multiple(self): df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=['A', 'B']) result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B) From 686ef8e8120dd0fd52bd31b3ed65ecbee1ba1b43 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Fri, 19 Jan 2018 16:09:01 +0700 Subject: [PATCH 2/9] BUG: Fixes problem with SparseArray coercing to float if index is passed --- pandas/core/frame.py | 5 ++--- pandas/core/sparse/array.py | 4 ++-- pandas/tests/frame/test_mutate_columns.py | 11 ++--------- pandas/tests/sparse/frame/test_frame.py | 12 ++++++++++++ pandas/tests/sparse/test_array.py | 11 +++++++++++ 5 files changed, 29 insertions(+), 14 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 326a551cfee24..727666a0f5112 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2649,7 +2649,7 @@ def insert(self, loc, column, value, allow_duplicates=False): allow_duplicates=allow_duplicates) def assign(self, **kwargs): - r""" + """ Assign new columns to a DataFrame, returning a new object (a copy) with all the original columns in addition to the new ones. @@ -2714,8 +2714,7 @@ def assign(self, **kwargs): 9 10 -0.758542 2.302585 """ - # See GH19163 - data = self.copy().to_dense() + data = self.copy() # do all calculations first... 
results = OrderedDict() diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index 9b2650359bf68..31dbedcf9218d 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -27,7 +27,7 @@ is_scalar, is_dtype_equal) from pandas.core.dtypes.cast import ( maybe_convert_platform, maybe_promote, - astype_nansafe, find_common_type) + astype_nansafe, find_common_type, infer_dtype_from) from pandas.core.dtypes.missing import isna, notna, na_value_for_dtype import pandas._libs.sparse as splib @@ -195,7 +195,7 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer', data = np.nan if not is_scalar(data): raise Exception("must only pass scalars with an index ") - values = np.empty(len(index), dtype='float64') + values = np.empty(len(index), dtype=infer_dtype_from(data)[0]) values.fill(data) data = values diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index f1d0284b5eeeb..25173cf04d9e4 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -6,9 +6,9 @@ import numpy as np from pandas.compat import PY36 -from pandas import DataFrame, Series, Index, MultiIndex +from pandas import DataFrame, Series, Index, MultiIndex, SparseSeries -from pandas.util.testing import assert_frame_equal +from pandas.util.testing import assert_frame_equal, assert_series_equal import pandas.util.testing as tm @@ -55,13 +55,6 @@ def test_assign(self): result = df.assign(A=lambda x: x.A + x.B) assert_frame_equal(result, expected) - # SparseDataFrame - # See GH 19163 - result = df.to_sparse(fill_value=False).assign(newcol=False) - expected = df.assign(newcol=False) - assert type(result) is DataFrame - assert_frame_equal(expected, result) - def test_assign_multiple(self): df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=['A', 'B']) result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B) diff --git a/pandas/tests/sparse/frame/test_frame.py 
b/pandas/tests/sparse/frame/test_frame.py index 058892e3b85ff..1454c7d1351cf 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -1271,3 +1271,15 @@ def test_quantile_multi(self): tm.assert_frame_equal(result, dense_expected) tm.assert_sp_frame_equal(result, sparse_expected) + + def test_assign_with_sparse_frame(self): + # GH 19163 + df = pd.DataFrame({"a":[1,2,3]}) + res = df.to_sparse(fill_value=False).assign(newcol=False) + exp = df.assign(newcol=False).to_sparse(fill_value=False) + + tm.assert_sp_frame_equal(res, exp) + + for column in res.columns: + assert type(res[column]) is SparseSeries + diff --git a/pandas/tests/sparse/test_array.py b/pandas/tests/sparse/test_array.py index 8de93ff320961..c8d2b939c0d34 100644 --- a/pandas/tests/sparse/test_array.py +++ b/pandas/tests/sparse/test_array.py @@ -113,6 +113,17 @@ def test_constructor_spindex_dtype(self): assert arr.dtype == np.int64 assert arr.fill_value == 0 + @pytest.mark.parametrize('scalar,dtype', [(False, bool), (0.0, 'float64'), (1, 'int64'), ('z', 'object')]) + def test_scalar_with_index_infer_dtype(self, scalar, dtype): + # GH 19163 + arr = SparseArray(scalar, index=[1,2,3], fill_value=scalar) + exp = SparseArray([scalar, scalar, scalar], fill_value=scalar) + + tm.assert_sp_array_equal(arr, exp) + + assert arr.dtype == dtype + assert exp.dtype == dtype + def test_sparseseries_roundtrip(self): # GH 13999 for kind in ['integer', 'block']: From ac6213af27c87609fd27151763adfc957908de5a Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Fri, 19 Jan 2018 16:14:06 +0700 Subject: [PATCH 3/9] Cleanup --- pandas/core/frame.py | 2 +- pandas/tests/frame/test_mutate_columns.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 202a1e7765b57..47b3292e49dfd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2650,7 +2650,7 @@ def insert(self, loc, column, value, allow_duplicates=False): 
allow_duplicates=allow_duplicates) def assign(self, **kwargs): - """ + r""" Assign new columns to a DataFrame, returning a new object (a copy) with all the original columns in addition to the new ones. diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 25173cf04d9e4..9acdf2f17d86a 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -6,9 +6,9 @@ import numpy as np from pandas.compat import PY36 -from pandas import DataFrame, Series, Index, MultiIndex, SparseSeries +from pandas import DataFrame, Series, Index, MultiIndex -from pandas.util.testing import assert_frame_equal, assert_series_equal +from pandas.util.testing import assert_frame_equal import pandas.util.testing as tm From 30425685fdeade91e5e620353ff3ee749fcba14b Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Fri, 19 Jan 2018 16:16:35 +0700 Subject: [PATCH 4/9] More cleanup --- pandas/tests/sparse/frame/test_frame.py | 3 +-- pandas/tests/sparse/test_array.py | 8 ++++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 1454c7d1351cf..91c3817478782 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -1274,7 +1274,7 @@ def test_quantile_multi(self): def test_assign_with_sparse_frame(self): # GH 19163 - df = pd.DataFrame({"a":[1,2,3]}) + df = pd.DataFrame({"a": [1, 2, 3]}) res = df.to_sparse(fill_value=False).assign(newcol=False) exp = df.assign(newcol=False).to_sparse(fill_value=False) @@ -1282,4 +1282,3 @@ def test_assign_with_sparse_frame(self): for column in res.columns: assert type(res[column]) is SparseSeries - diff --git a/pandas/tests/sparse/test_array.py b/pandas/tests/sparse/test_array.py index c8d2b939c0d34..6c0c83cf65ff7 100644 --- a/pandas/tests/sparse/test_array.py +++ b/pandas/tests/sparse/test_array.py @@ -113,10 +113,14 @@ def 
test_constructor_spindex_dtype(self): assert arr.dtype == np.int64 assert arr.fill_value == 0 - @pytest.mark.parametrize('scalar,dtype', [(False, bool), (0.0, 'float64'), (1, 'int64'), ('z', 'object')]) + @pytest.mark.parametrize('scalar,dtype', [ + (False, bool), + (0.0, 'float64'), + (1, 'int64'), + ('z', 'object')]) def test_scalar_with_index_infer_dtype(self, scalar, dtype): # GH 19163 - arr = SparseArray(scalar, index=[1,2,3], fill_value=scalar) + arr = SparseArray(scalar, index=[1, 2, 3], fill_value=scalar) exp = SparseArray([scalar, scalar, scalar], fill_value=scalar) tm.assert_sp_array_equal(arr, exp) From d900e11f376f904ecda59de575552d48b60f8485 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Fri, 19 Jan 2018 16:17:41 +0700 Subject: [PATCH 5/9] More cleanup --- doc/source/whatsnew/v0.23.0.txt | 2 -- pandas/core/frame.py | 1 - 2 files changed, 3 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 80020742d1f44..f47b4c8412a0f 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -509,8 +509,6 @@ Reshaping - Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`) - Bug in timezone comparisons, manifesting as a conversion of the index to UTC in ``.concat()`` (:issue:`18523`) ->>>>>>> upstream/master - Numeric ^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 47b3292e49dfd..f0919871218f5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2714,7 +2714,6 @@ def assign(self, **kwargs): 8 9 0.549296 2.197225 9 10 -0.758542 2.302585 """ - data = self.copy() # do all calculations first... 
From a17a5931b263b0e08a0e3e4e706443816c08d757 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Sat, 20 Jan 2018 07:27:24 +0700 Subject: [PATCH 6/9] Comments from PR Updates --- doc/source/whatsnew/v0.23.0.txt | 2 +- pandas/core/sparse/array.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index f47b4c8412a0f..3830604ce5f8e 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -491,7 +491,7 @@ Groupby/Resample/Rolling Sparse ^^^^^^ -- Bug in :class:`SparseArray` where if a scalar and index are passed in it will coerce to float64 regardless of scalar's dtype. (:issue:`19163`) +- Bug in constructing a :class:`SparseArray`: if `data` is a scalar and `index` is defined it will coerce to float64 regardless of scalar's dtype. (:issue:`19163`) - - diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index 31dbedcf9218d..7fdf51d0e07aa 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -27,7 +27,7 @@ is_scalar, is_dtype_equal) from pandas.core.dtypes.cast import ( maybe_convert_platform, maybe_promote, - astype_nansafe, find_common_type, infer_dtype_from) + astype_nansafe, find_common_type, infer_dtype_from_scalar) from pandas.core.dtypes.missing import isna, notna, na_value_for_dtype import pandas._libs.sparse as splib @@ -195,7 +195,7 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer', data = np.nan if not is_scalar(data): raise Exception("must only pass scalars with an index ") - values = np.empty(len(index), dtype=infer_dtype_from(data)[0]) + values = np.empty(len(index), dtype=infer_dtype_from_scalar(data)[0]) values.fill(data) data = values From 559434a5d0e4560f39de51d2c48f7929b0bd44b5 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Mon, 5 Feb 2018 10:12:03 +0700 Subject: [PATCH 7/9] Fix linting error --- pandas/core/sparse/array.py | 3 ++- 1 file changed, 2 insertions(+), 1 
deletion(-) diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index 0ad4a3dcceec7..0134c27209004 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -161,7 +161,8 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer', data = np.nan if not is_scalar(data): raise Exception("must only pass scalars with an index ") - values = np.empty(len(index), dtype=infer_dtype_from_scalar(data)[0]) + values = np.empty(len(index), + dtype=infer_dtype_from_scalar(data)[0]) values.fill(data) data = values From 16a272d4a59ca998edd7c6243bbd364e7f6d49aa Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Mon, 5 Feb 2018 19:23:04 +0700 Subject: [PATCH 8/9] Update whatsnew entry and use cast function --- doc/source/whatsnew/v0.23.0.txt | 2 +- pandas/core/sparse/array.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 7f4e75bd90707..09d7acda59021 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -555,7 +555,7 @@ Sparse - Bug in which creating a ``SparseDataFrame`` from a dense ``Series`` or an unsupported type raised an uncontrolled exception (:issue:`19374`) - Bug in :class:`SparseDataFrame.to_csv` causing exception (:issue:`19384`) -- Bug in constructing a :class:`SparseArray`: if `data` is a scalar and `index` is defined it will coerce to float64 regardless of scalar's dtype. (:issue:`19163`) +- Bug in constructing a ``SparseArray``: if ``data`` is a scalar and ``index`` is defined it will coerce to float64 regardless of scalar's dtype. 
(:issue:`19163`) - Reshaping diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index 0134c27209004..e3b047c34400f 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -26,7 +26,8 @@ is_scalar, is_dtype_equal) from pandas.core.dtypes.cast import ( maybe_convert_platform, maybe_promote, - astype_nansafe, find_common_type, infer_dtype_from_scalar) + astype_nansafe, find_common_type, infer_dtype_from_scalar, + construct_1d_arraylike_from_scalar) from pandas.core.dtypes.missing import isna, notna, na_value_for_dtype import pandas._libs.sparse as splib @@ -161,10 +162,9 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer', data = np.nan if not is_scalar(data): raise Exception("must only pass scalars with an index ") - values = np.empty(len(index), - dtype=infer_dtype_from_scalar(data)[0]) - values.fill(data) - data = values + dtype = infer_dtype_from_scalar(data)[0] + data = construct_1d_arraylike_from_scalar(data, len(index), + dtype) if isinstance(data, ABCSparseSeries): data = data.values From a81796a2d2d5fcc03e6bf3807c73164f2ed5a97a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 12 Feb 2018 06:39:26 -0500 Subject: [PATCH 9/9] clean --- doc/source/whatsnew/v0.23.0.txt | 2 +- pandas/core/sparse/array.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 4485e704f5bd1..72f63a4da0f4d 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -822,7 +822,7 @@ Sparse - Bug in which creating a ``SparseDataFrame`` from a dense ``Series`` or an unsupported type raised an uncontrolled exception (:issue:`19374`) - Bug in :class:`SparseDataFrame.to_csv` causing exception (:issue:`19384`) - Bug in :class:`SparseSeries.memory_usage` which caused segfault by accessing non sparse elements (:issue:`19368`) -- Bug in constructing a ``SparseArray``: if ``data`` is a scalar and ``index`` is defined 
it will coerce to float64 regardless of scalar's dtype. (:issue:`19163`) +- Bug in constructing a ``SparseArray``: if ``data`` is a scalar and ``index`` is defined it will coerce to ``float64`` regardless of scalar's dtype. (:issue:`19163`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index ab0689fbb8fc2..3cbae717d0e07 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -164,8 +164,8 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer', if not is_scalar(data): raise Exception("must only pass scalars with an index ") dtype = infer_dtype_from_scalar(data)[0] - data = construct_1d_arraylike_from_scalar(data, len(index), - dtype) + data = construct_1d_arraylike_from_scalar( + data, len(index), dtype) if isinstance(data, ABCSparseSeries): data = data.values