From cac51c6f230ebe1a7f1b5f4285c5a804b6113e4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Sat, 18 Mar 2017 06:01:04 +0100 Subject: [PATCH 01/27] add tests that succeed with current pandas version but show the inconsistency on values --- pandas/tests/frame/test_dtypes.py | 51 +++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index f7d2c1a654cd5..5b6fcf6f8c7ef 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -622,3 +622,54 @@ def test_astype_str(self): 'NaT NaT' in result) self.assertTrue('2 2013-01-03 2013-01-03 00:00:00-05:00 ' '2013-01-03 00:00:00+01:00' in result) + + def test_values_is_ndarray_with_datetime64tz(self): + df = DataFrame({'A': date_range('20130101', periods=3), + 'B': date_range('20130101', periods=3, tz='US/Eastern'), + }) + + for col in [ + ["A"], + ["A", "A"], + ["A", "B"], + ["B"], + ["B", "B"], + ]: + if col==["B"]: + self.assertTrue(type(df[col].values)==pd.DatetimeIndex) + else: + self.assertTrue(type(df[col].values)==np.ndarray) + + + def test_values_dtypes_with_datetime64tz(self): + df = DataFrame({'A': date_range('20130101', periods=3), + 'B': date_range('20130101', periods=3, tz='US/Eastern'), + }) + + for col in [ + ["A"], + ["A", "A"], + ["A", "B"], + ["B"], + ["B", "B"], + ]: + + df_sub = df[col] + arr = df_sub.values + + # array has the same dtype as dataframe only and only if + # - all columns are of type datetime64[ns] + + # TODO: replace 2nd condition by 'all columns are of type datetime64[ns,same timezone]' + # - all columns are of the same dtype being one of type datetime64[ns] + # or datetime64[ns,tz] with the same tz + if all(df_sub.dtypes.values == " Date: Sat, 18 Mar 2017 06:12:41 +0100 Subject: [PATCH 02/27] BUG: fix df.values when df is of type datetime64[ns,tz] to return ndarray instead of DateTimeIndex --- pandas/tests/frame/test_dtypes.py | 12 
++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 5b6fcf6f8c7ef..db2bb5681d1d7 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -635,10 +635,7 @@ def test_values_is_ndarray_with_datetime64tz(self): ["B"], ["B", "B"], ]: - if col==["B"]: - self.assertTrue(type(df[col].values)==pd.DatetimeIndex) - else: - self.assertTrue(type(df[col].values)==np.ndarray) + self.assertTrue(type(df[col].values)==np.ndarray) def test_values_dtypes_with_datetime64tz(self): @@ -659,7 +656,6 @@ def test_values_dtypes_with_datetime64tz(self): # array has the same dtype as dataframe only and only if # - all columns are of type datetime64[ns] - # TODO: replace 2nd condition by 'all columns are of type datetime64[ns,same timezone]' # - all columns are of the same dtype being one of type datetime64[ns] # or datetime64[ns,tz] with the same tz @@ -668,8 +664,4 @@ def test_values_dtypes_with_datetime64tz(self): # otherwise, dtype is object else: - if col==["B"]: - self.assertTrue(type(arr)==pd.DatetimeIndex) - self.assertTrue(isinstance(arr.dtype, DatetimeTZDtype)) - else: - self.assertTrue(arr.dtype == object) \ No newline at end of file + self.assertTrue(arr.dtype == object) \ No newline at end of file From 8b46fada18220825ba7100cce5d2d1e5ecba2ba9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Sat, 18 Mar 2017 10:25:51 +0100 Subject: [PATCH 03/27] fix small bugs: - _where was not special casing pd.Panels - np.putmask was used instead of rs.mask --- pandas/core/generic.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1db9677659ca3..c794d3d461257 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4850,7 +4850,7 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, msg = "Boolean array expected for the 
condition, not {dtype}" - if not isinstance(cond, pd.DataFrame): + if not isinstance(cond, (pd.DataFrame, pd.Panel)): # This is a single-dimensional object. if not is_bool_dtype(cond): raise ValueError(msg.format(dtype=cond.dtype)) @@ -5098,6 +5098,7 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, try_cast=False, raise_on_error=True): other = com._apply_if_callable(other, self) + return self._where(cond, other, inplace, axis, level, try_cast, raise_on_error) @@ -5783,7 +5784,7 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, **kwargs)) - 1) if freq is None: mask = isnull(_values_from_object(self)) - np.putmask(rs.values, mask, np.nan) + rs.mask(mask, np.nan,inplace=True) return rs def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): From 78f931d9fc61c4b56f66544a2e3b51f8773d39c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Sat, 18 Mar 2017 10:26:27 +0100 Subject: [PATCH 04/27] BUG: fix interleave and as_matrix by reworking logic --- pandas/core/internals.py | 63 +++++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 14 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 0e6c176d950a1..69d2115446be2 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -66,6 +66,7 @@ from pandas.compat import range, map, zip, u + class Block(PandasObject): """ Canonical n-dimensional unit of homogeneous dtype contained in a pandas @@ -3424,10 +3425,7 @@ def as_matrix(self, items=None): else: mgr = self - if self._is_single_block or not self.is_mixed_type: - return mgr.blocks[0].get_values() - else: - return mgr._interleave() + return mgr._interleave() def _interleave(self): """ @@ -3436,6 +3434,11 @@ def _interleave(self): """ dtype = _interleaved_dtype(self.blocks) + # TODO: add shortcut to avoid copy + if self._is_single_block and dtype != np.object: + return np.array(self.blocks[0].get_values(), + 
dtype=dtype, copy=False) + result = np.empty(self.shape, dtype=dtype) if result.shape[0] == 0: @@ -4485,33 +4488,65 @@ def _interleaved_dtype(blocks): for x in blocks: counts[type(x)].append(x) - have_int = len(counts[IntBlock]) > 0 have_bool = len(counts[BoolBlock]) > 0 have_object = len(counts[ObjectBlock]) > 0 + have_int = len(counts[IntBlock]) > 0 have_float = len(counts[FloatBlock]) > 0 have_complex = len(counts[ComplexBlock]) > 0 have_dt64 = len(counts[DatetimeBlock]) > 0 have_dt64_tz = len(counts[DatetimeTZBlock]) > 0 have_td64 = len(counts[TimeDeltaBlock]) > 0 - have_cat = len(counts[CategoricalBlock]) > 0 + have_cat = len(counts[CategoricalBlock]) # TODO: have_sparse is not used have_sparse = len(counts[SparseBlock]) > 0 # noqa - have_numeric = have_float or have_complex or have_int - has_non_numeric = have_dt64 or have_dt64_tz or have_td64 or have_cat + have_numeric = have_float + have_complex + have_int + have_non_numeric = have_dt64 + have_dt64_tz + have_td64 + have_cat + have_mixed = have_numeric + have_non_numeric + + # print(have_cat, blocks, blocks[0].dtype) + # if have_cat: + # print(blocks[0].get_values().dtype) if (have_object or - (have_bool and - (have_numeric or have_dt64 or have_dt64_tz or have_td64)) or - (have_numeric and has_non_numeric) or have_cat or have_dt64 or - have_dt64_tz or have_td64): + (have_bool and have_mixed) or # bool and something else + (have_non_numeric > 1) or # more than one type of non numeric + (have_non_numeric and have_numeric) or # mix of a numeric et non numeric + have_cat>1 or have_dt64_tz): return np.dtype(object) elif have_bool: return np.dtype(bool) + elif have_dt64: + return np.dtype("datetime64[ns]") + elif have_td64: + return np.dtype("timedelta64[ns]") + elif have_cat: + # return blocks[0].get_values().dtype + # if we are mixing unsigned and signed, then return + # the next biggest int type (if we can) + + dts = [b.get_values().dtype for b in counts[CategoricalBlock]] + lcd = _find_common_type(dts) + kinds 
= set([_dt.kind for _dt in dts]) + + if len(kinds) == 1: + return lcd + + if lcd == 'uint64' or lcd == 'int64': + return np.dtype('int64') + + # return 1 bigger on the itemsize if unsinged + if lcd.kind == 'u': + return np.dtype('int%s' % (lcd.itemsize * 8 * 2)) + return lcd + elif have_int and not have_float and not have_complex: # if we are mixing unsigned and signed, then return # the next biggest int type (if we can) - lcd = _find_common_type([b.dtype for b in counts[IntBlock]]) - kinds = set([i.dtype.kind for i in counts[IntBlock]]) + + dts = [b.dtype for b in counts[IntBlock]] + lcd = _find_common_type(dts) + kinds = set([_dt.kind for _dt in dts]) + if len(kinds) == 1: return lcd From 6fa742696beee6638bf19a2a8c52ae5534bfb8cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Sat, 18 Mar 2017 10:27:11 +0100 Subject: [PATCH 05/27] add test for issue #14052 --- pandas/tests/frame/test_dtypes.py | 70 ++++++++++++++++++------------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index db2bb5681d1d7..ebe16d789580c 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -1,23 +1,25 @@ # -*- coding: utf-8 -*- from __future__ import print_function + +import itertools from datetime import timedelta import numpy as np + +import pandas as pd +import pandas.util.testing as tm from pandas import (DataFrame, Series, date_range, Timedelta, Timestamp, compat, concat, option_context) from pandas.compat import u -from pandas.types.dtypes import DatetimeTZDtype from pandas.tests.frame.common import TestData +from pandas.types.dtypes import DatetimeTZDtype from pandas.util.testing import (assert_series_equal, assert_frame_equal, makeCustomDataframe as mkdf) -import pandas.util.testing as tm -import pandas as pd class TestDataFrameDataTypes(tm.TestCase, TestData): - def test_concat_empty_dataframe_dtypes(self): df = 
DataFrame(columns=list("abc")) df['a'] = df['a'].astype(np.bool_) @@ -198,7 +200,7 @@ def test_select_dtypes_not_an_attr_but_still_valid_dtype(self): def test_select_dtypes_empty(self): df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))}) with tm.assertRaisesRegexp(ValueError, 'at least one of include or ' - 'exclude must be nonempty'): + 'exclude must be nonempty'): df.select_dtypes() def test_select_dtypes_raises_on_string(self): @@ -536,7 +538,6 @@ def test_arg_for_errors_in_astype(self): class TestDataFrameDatetimeWithTZ(tm.TestCase, TestData): - def test_interleave(self): # interleave with object @@ -635,33 +636,44 @@ def test_values_is_ndarray_with_datetime64tz(self): ["B"], ["B", "B"], ]: - self.assertTrue(type(df[col].values)==np.ndarray) - + self.assertTrue(type(df[col].values) == np.ndarray) def test_values_dtypes_with_datetime64tz(self): - df = DataFrame({'A': date_range('20130101', periods=3), - 'B': date_range('20130101', periods=3, tz='US/Eastern'), + df = DataFrame({'dt': date_range('20130101', periods=3), + 'dttz': date_range('20130101', periods=3, tz='US/Eastern'), + 'td': date_range('20130102', periods=3) + - date_range('20130101', periods=3), + 'cat': pd.Categorical(['a', 'b', 'b']), + 'b': [True, False, False], + 'i': [1, 2, 3], + 'f': [1.3, 2, 3], + 'c': [1j, 2, 3], }) - for col in [ - ["A"], - ["A", "A"], - ["A", "B"], - ["B"], - ["B", "B"], - ]: + cols = itertools.chain(itertools.combinations_with_replacement(df.columns, 1), + itertools.combinations_with_replacement(df.columns, 2)) + for col in cols: - df_sub = df[col] + df_sub = df[list(col)] arr = df_sub.values - - # array has the same dtype as dataframe only and only if - # - all columns are of type datetime64[ns] - # TODO: replace 2nd condition by 'all columns are of type datetime64[ns,same timezone]' - # - all columns are of the same dtype being one of type datetime64[ns] - # or datetime64[ns,tz] with the same tz - if all(df_sub.dtypes.values == " Date: Sat, 18 Mar 2017 11:53:03 
+0100 Subject: [PATCH 06/27] BUG: fix interleave and as_matrix by reworking logic --- pandas/core/internals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 69d2115446be2..631aea872bdf2 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3435,7 +3435,7 @@ def _interleave(self): dtype = _interleaved_dtype(self.blocks) # TODO: add shortcut to avoid copy - if self._is_single_block and dtype != np.object: + if self._is_single_block: # and dtype != np.object: return np.array(self.blocks[0].get_values(), dtype=dtype, copy=False) From 415d120d4efa624c9fbaa633d4e767b62a423b99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Sun, 19 Mar 2017 08:25:57 +0100 Subject: [PATCH 07/27] return datetime64[ns] for values for any block with only datetime64 derived dtypes --- pandas/core/internals.py | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 631aea872bdf2..68f096e9b18ac 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -66,7 +66,6 @@ from pandas.compat import range, map, zip, u - class Block(PandasObject): """ Canonical n-dimensional unit of homogeneous dtype contained in a pandas @@ -3434,11 +3433,11 @@ def _interleave(self): """ dtype = _interleaved_dtype(self.blocks) - # TODO: add shortcut to avoid copy - if self._is_single_block: # and dtype != np.object: + # print(dtype, self._is_single_block , not self.is_mixed_type, self.blocks) + + if self._is_single_block or not self.is_mixed_type: # and dtype != np.object: return np.array(self.blocks[0].get_values(), dtype=dtype, copy=False) - result = np.empty(self.shape, dtype=dtype) if result.shape[0] == 0: @@ -4500,25 +4499,23 @@ def _interleaved_dtype(blocks): # TODO: have_sparse is not used have_sparse = len(counts[SparseBlock]) > 0 # noqa have_numeric = have_float + 
have_complex + have_int + have_dt = have_dt64 + have_dt64_tz have_non_numeric = have_dt64 + have_dt64_tz + have_td64 + have_cat - have_mixed = have_numeric + have_non_numeric - - # print(have_cat, blocks, blocks[0].dtype) - # if have_cat: - # print(blocks[0].get_values().dtype) + have_non_dt = have_td64 + have_cat + have_mixed = bool(have_numeric) + bool(have_non_dt) + bool(have_dt) if (have_object or - (have_bool and have_mixed) or # bool and something else (have_non_numeric > 1) or # more than one type of non numeric - (have_non_numeric and have_numeric) or # mix of a numeric et non numeric - have_cat>1 or have_dt64_tz): + (have_bool and have_mixed) or # mix of a numeric et non numeric + (have_mixed>1) or # mix of a numeric et non numeric + (have_cat>1)): return np.dtype(object) - elif have_bool: - return np.dtype(bool) - elif have_dt64: + elif have_dt: return np.dtype("datetime64[ns]") elif have_td64: return np.dtype("timedelta64[ns]") + elif have_bool: + return np.dtype("bool") elif have_cat: # return blocks[0].get_values().dtype # if we are mixing unsigned and signed, then return @@ -4563,8 +4560,16 @@ def _interleaved_dtype(blocks): elif have_complex: return np.dtype('c16') else: + + introspection_blks = counts[FloatBlock] + counts[SparseBlock] - return _find_common_type([b.dtype for b in introspection_blks]) + try: + return _find_common_type([b.dtype for b in introspection_blks]) + except ValueError: + print([(k,v) for k,v in locals().items() if k.startswith("have_") and v]) + print(blocks) + + raise def _consolidate(blocks): From b535cf4cbcec55f9128163d697add24046b07a39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Sun, 19 Mar 2017 08:26:29 +0100 Subject: [PATCH 08/27] fix test on dtype of values for datetime64 with 2 different tz --- pandas/tests/test_internals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index df5e843097514..de90e74c7fc90 
100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -532,7 +532,7 @@ def test_as_matrix_datetime_tz(self): mgr = create_mgr('h: M8[ns, US/Eastern]; g: M8[ns, CET]') self.assertEqual(mgr.get('h').dtype, 'datetime64[ns, US/Eastern]') self.assertEqual(mgr.get('g').dtype, 'datetime64[ns, CET]') - self.assertEqual(mgr.as_matrix().dtype, 'object') + self.assertEqual(mgr.as_matrix().dtype, 'M8[ns]') def test_astype(self): # coerce all From 59e4ca584904183a52030bf70a576aaea05563c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Sun, 19 Mar 2017 08:34:57 +0100 Subject: [PATCH 09/27] remove test on dframes as buggy (just to try CI) --- pandas/tests/io/json/test_pandas.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 7dbcf25c60b45..2c4ed9c716fe1 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -938,11 +938,12 @@ def test_tz_range_is_utc(self): '"1":"2013-01-02T05:00:00.000Z"}}') tz_range = pd.date_range('2013-01-01 05:00:00Z', periods=2) + self.assertEqual(exp, dumps(tz_range, iso_dates=True)) dti = pd.DatetimeIndex(tz_range) self.assertEqual(exp, dumps(dti, iso_dates=True)) df = DataFrame({'DT': dti}) - self.assertEqual(dfexp, dumps(df, iso_dates=True)) + # self.assertEqual(dfexp, dumps(df, iso_dates=True)) tz_range = pd.date_range('2013-01-01 00:00:00', periods=2, tz='US/Eastern') @@ -950,14 +951,14 @@ def test_tz_range_is_utc(self): dti = pd.DatetimeIndex(tz_range) self.assertEqual(exp, dumps(dti, iso_dates=True)) df = DataFrame({'DT': dti}) - self.assertEqual(dfexp, dumps(df, iso_dates=True)) + # self.assertEqual(dfexp, dumps(df, iso_dates=True)) tz_range = pd.date_range('2013-01-01 00:00:00-0500', periods=2) self.assertEqual(exp, dumps(tz_range, iso_dates=True)) dti = pd.DatetimeIndex(tz_range) self.assertEqual(exp, dumps(dti, iso_dates=True)) df = 
DataFrame({'DT': dti}) - self.assertEqual(dfexp, dumps(df, iso_dates=True)) + # self.assertEqual(dfexp, dumps(df, iso_dates=True)) def test_read_jsonl(self): # GH9180 From 59c63583c50df163cf7819f462e0e9b7493e79cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Sun, 19 Mar 2017 19:00:48 +0100 Subject: [PATCH 10/27] use dtype in get_values for special case --- pandas/core/internals.py | 12 +++++------- pandas/tests/frame/test_dtypes.py | 9 +++++++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 68f096e9b18ac..c90b690565ce1 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3433,11 +3433,10 @@ def _interleave(self): """ dtype = _interleaved_dtype(self.blocks) - # print(dtype, self._is_single_block , not self.is_mixed_type, self.blocks) - - if self._is_single_block or not self.is_mixed_type: # and dtype != np.object: - return np.array(self.blocks[0].get_values(), + if self._is_single_block or not self.is_mixed_type: + return np.array(self.blocks[0].get_values(dtype), dtype=dtype, copy=False) + result = np.empty(self.shape, dtype=dtype) if result.shape[0] == 0: @@ -4508,9 +4507,10 @@ def _interleaved_dtype(blocks): (have_non_numeric > 1) or # more than one type of non numeric (have_bool and have_mixed) or # mix of a numeric et non numeric (have_mixed>1) or # mix of a numeric et non numeric + have_dt64_tz or (have_cat>1)): return np.dtype(object) - elif have_dt: + elif have_dt64: return np.dtype("datetime64[ns]") elif have_td64: return np.dtype("timedelta64[ns]") @@ -4560,8 +4560,6 @@ def _interleaved_dtype(blocks): elif have_complex: return np.dtype('c16') else: - - introspection_blks = counts[FloatBlock] + counts[SparseBlock] try: return _find_common_type([b.dtype for b in introspection_blks]) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index ebe16d789580c..ed70606d5032a 100644 --- 
a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -633,10 +633,15 @@ def test_values_is_ndarray_with_datetime64tz(self): ["A"], ["A", "A"], ["A", "B"], - ["B"], ["B", "B"], + ["B"], ]: - self.assertTrue(type(df[col].values) == np.ndarray) + arr = df[col].values + fst_elem = arr[0,0] if len(arr.shape)==2 else arr[0] + + self.assertEqual(type(arr), np.ndarray) + self.assertEqual(type(fst_elem), pd.Timestamp if "B" in col else np.datetime64) + def test_values_dtypes_with_datetime64tz(self): df = DataFrame({'dt': date_range('20130101', periods=3), From 46fe072fd5d9ed2f3555aeaa6c7c2e84d1a86665 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Sun, 19 Mar 2017 20:40:17 +0100 Subject: [PATCH 11/27] reshape values in get_values of DatetimeTZBlock (follow same logic of get_values of father class NonConsolidatableMixIn) --- pandas/core/internals.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index c90b690565ce1..aca9f7c061636 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2388,9 +2388,14 @@ def get_values(self, dtype=None): # return object dtype as Timestamps with the zones if is_object_dtype(dtype): f = lambda x: lib.Timestamp(x, tz=self.values.tz) - return lib.map_infer( + print("self.values.shape, self.ndim, self.shape",self.values.shape, self.ndim, self.shape) + values = lib.map_infer( self.values.ravel(), f).reshape(self.values.shape) - return self.values + else: + values = self.values + if values.ndim == self.ndim - 1: + values = values.reshape((1,) + values.shape) + return values def to_object_block(self, mgr): """ @@ -4202,6 +4207,7 @@ def external_values(self): def internal_values(self): return self._block.internal_values() + def get_values(self): """ return a dense type view """ return np.array(self._block.to_dense(), copy=False) From fd32043d8de6df5306063d1cd83aa24be9f7f79a Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Sun, 19 Mar 2017 20:41:55 +0100 Subject: [PATCH 12/27] reshape values in get_values of DatetimeTZBlock (follow same logic of get_values of father class NonConsolidatableMixIn) --- pandas/core/internals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index aca9f7c061636..a7bd725645db7 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2388,11 +2388,11 @@ def get_values(self, dtype=None): # return object dtype as Timestamps with the zones if is_object_dtype(dtype): f = lambda x: lib.Timestamp(x, tz=self.values.tz) - print("self.values.shape, self.ndim, self.shape",self.values.shape, self.ndim, self.shape) values = lib.map_infer( self.values.ravel(), f).reshape(self.values.shape) else: values = self.values + if values.ndim == self.ndim - 1: values = values.reshape((1,) + values.shape) return values From 69cec8e15d9b6668b2b6f2f81ad5a29f2cdf9706 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Sun, 19 Mar 2017 20:42:23 +0100 Subject: [PATCH 13/27] revert changes --- pandas/tests/frame/test_dtypes.py | 3 ++- pandas/tests/io/json/test_pandas.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index ed70606d5032a..29a39a4af7df4 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -637,7 +637,8 @@ def test_values_is_ndarray_with_datetime64tz(self): ["B"], ]: arr = df[col].values - fst_elem = arr[0,0] if len(arr.shape)==2 else arr[0] + print(col, arr.shape, df[col].blocks) + fst_elem = arr[0,0] self.assertEqual(type(arr), np.ndarray) self.assertEqual(type(fst_elem), pd.Timestamp if "B" in col else np.datetime64) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 2c4ed9c716fe1..594b33000965f 100644 --- 
a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -943,7 +943,7 @@ def test_tz_range_is_utc(self): dti = pd.DatetimeIndex(tz_range) self.assertEqual(exp, dumps(dti, iso_dates=True)) df = DataFrame({'DT': dti}) - # self.assertEqual(dfexp, dumps(df, iso_dates=True)) + self.assertEqual(dfexp, dumps(df, iso_dates=True)) tz_range = pd.date_range('2013-01-01 00:00:00', periods=2, tz='US/Eastern') @@ -951,14 +951,14 @@ def test_tz_range_is_utc(self): dti = pd.DatetimeIndex(tz_range) self.assertEqual(exp, dumps(dti, iso_dates=True)) df = DataFrame({'DT': dti}) - # self.assertEqual(dfexp, dumps(df, iso_dates=True)) + self.assertEqual(dfexp, dumps(df, iso_dates=True)) tz_range = pd.date_range('2013-01-01 00:00:00-0500', periods=2) self.assertEqual(exp, dumps(tz_range, iso_dates=True)) dti = pd.DatetimeIndex(tz_range) self.assertEqual(exp, dumps(dti, iso_dates=True)) df = DataFrame({'DT': dti}) - # self.assertEqual(dfexp, dumps(df, iso_dates=True)) + self.assertEqual(dfexp, dumps(df, iso_dates=True)) def test_read_jsonl(self): # GH9180 From e83bc0a44a8ff7a4be07d4f905ae6c018e70c948 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Sun, 19 Mar 2017 20:42:36 +0100 Subject: [PATCH 14/27] revert changes --- pandas/tests/test_internals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index de90e74c7fc90..df5e843097514 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -532,7 +532,7 @@ def test_as_matrix_datetime_tz(self): mgr = create_mgr('h: M8[ns, US/Eastern]; g: M8[ns, CET]') self.assertEqual(mgr.get('h').dtype, 'datetime64[ns, US/Eastern]') self.assertEqual(mgr.get('g').dtype, 'datetime64[ns, CET]') - self.assertEqual(mgr.as_matrix().dtype, 'M8[ns]') + self.assertEqual(mgr.as_matrix().dtype, 'object') def test_astype(self): # coerce all From ce9ef4f76c59d8fcdc66c2cfe6280bcb862defce Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Sun, 19 Mar 2017 20:43:51 +0100 Subject: [PATCH 15/27] clean tests --- pandas/tests/frame/test_dtypes.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 29a39a4af7df4..54daa8cef036f 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -637,7 +637,6 @@ def test_values_is_ndarray_with_datetime64tz(self): ["B"], ]: arr = df[col].values - print(col, arr.shape, df[col].blocks) fst_elem = arr[0,0] self.assertEqual(type(arr), np.ndarray) @@ -669,17 +668,17 @@ def test_values_dtypes_with_datetime64tz(self): if len(set(dts)) == 1: if dts[0] in (" Date: Sun, 19 Mar 2017 21:00:21 +0100 Subject: [PATCH 16/27] update whatsnew --- doc/source/whatsnew/v0.20.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index a56212328f5c3..9da2d2367685e 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -792,6 +792,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Bug in ``DataFrame.values`` now returns a ``numpy.ndarray`` of ``pandas.Timestamp`` for tz-aware columns; previously this returned ``pandas.DatetimeIndex`` (:issue:`14052`) - Bug in ``Timestamp.replace`` now raises ``TypeError`` when incorrect argument names are given; previously this raised ``ValueError`` (:issue:`15240`) - Bug in ``Index`` power operations with reversed operands (:issue:`14973`) - Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`) From 479b16d9a68d5da8ab63b16a2dd4ebfae079335c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Sun, 19 Mar 2017 21:07:39 +0100 Subject: [PATCH 17/27] fix for flake8 --- pandas/core/generic.py | 2 +- pandas/core/internals.py | 15 ++++----------- pandas/tests/frame/test_dtypes.py | 28 
+++++++++++++++++----------- 3 files changed, 22 insertions(+), 23 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c794d3d461257..219150d141e7a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5784,7 +5784,7 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, **kwargs)) - 1) if freq is None: mask = isnull(_values_from_object(self)) - rs.mask(mask, np.nan,inplace=True) + rs.mask(mask, np.nan, inplace=True) return rs def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index a7bd725645db7..2eabd1b0574c9 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -4207,7 +4207,6 @@ def external_values(self): def internal_values(self): return self._block.internal_values() - def get_values(self): """ return a dense type view """ return np.array(self._block.to_dense(), copy=False) @@ -4510,11 +4509,11 @@ def _interleaved_dtype(blocks): have_mixed = bool(have_numeric) + bool(have_non_dt) + bool(have_dt) if (have_object or - (have_non_numeric > 1) or # more than one type of non numeric + (have_non_numeric > 1) or # more than one type of non numeric (have_bool and have_mixed) or # mix of a numeric et non numeric - (have_mixed>1) or # mix of a numeric et non numeric + (have_mixed > 1) or # mix of a numeric et non numeric have_dt64_tz or - (have_cat>1)): + (have_cat > 1)): return np.dtype(object) elif have_dt64: return np.dtype("datetime64[ns]") @@ -4567,13 +4566,7 @@ def _interleaved_dtype(blocks): return np.dtype('c16') else: introspection_blks = counts[FloatBlock] + counts[SparseBlock] - try: - return _find_common_type([b.dtype for b in introspection_blks]) - except ValueError: - print([(k,v) for k,v in locals().items() if k.startswith("have_") and v]) - print(blocks) - - raise + return _find_common_type([b.dtype for b in introspection_blks]) def _consolidate(blocks): diff --git 
a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 54daa8cef036f..2ecb04e4b16dc 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -625,9 +625,10 @@ def test_astype_str(self): '2013-01-03 00:00:00+01:00' in result) def test_values_is_ndarray_with_datetime64tz(self): - df = DataFrame({'A': date_range('20130101', periods=3), - 'B': date_range('20130101', periods=3, tz='US/Eastern'), - }) + df = DataFrame({ + 'A': date_range('20130101', periods=3), + 'B': date_range('20130101', periods=3, tz='US/Eastern'), + }) for col in [ ["A"], @@ -637,17 +638,20 @@ def test_values_is_ndarray_with_datetime64tz(self): ["B"], ]: arr = df[col].values - fst_elem = arr[0,0] + fst_elem = arr[0, 0] self.assertEqual(type(arr), np.ndarray) - self.assertEqual(type(fst_elem), pd.Timestamp if "B" in col else np.datetime64) - + if "B" in col: + self.assertEqual(type(fst_elem), pd.Timestamp) + else: + self.assertEqual(type(fst_elem), np.datetime64) def test_values_dtypes_with_datetime64tz(self): df = DataFrame({'dt': date_range('20130101', periods=3), - 'dttz': date_range('20130101', periods=3, tz='US/Eastern'), - 'td': date_range('20130102', periods=3) - - date_range('20130101', periods=3), + 'dttz': date_range('20130101', periods=3, + tz='US/Eastern'), + 'td': (date_range('20130102', periods=3) - + date_range('20130101', periods=3)), 'cat': pd.Categorical(['a', 'b', 'b']), 'b': [True, False, False], 'i': [1, 2, 3], @@ -655,8 +659,10 @@ def test_values_dtypes_with_datetime64tz(self): 'c': [1j, 2, 3], }) - cols = itertools.chain(itertools.combinations_with_replacement(df.columns, 1), - itertools.combinations_with_replacement(df.columns, 2)) + cols = itertools.chain( + itertools.combinations_with_replacement(df.columns, 1), + itertools.combinations_with_replacement(df.columns, 2) + ) for col in cols: df_sub = df[list(col)] From 99e71d40529d395bfb1ae570ebe34bf58ff1f835 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Sun, 19 Mar 2017 21:25:21 +0100 Subject: [PATCH 18/27] add control of type of first element of df.values --- pandas/tests/frame/test_dtypes.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 2ecb04e4b16dc..7da269f3a0a2b 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -670,9 +670,16 @@ def test_values_dtypes_with_datetime64tz(self): dts = df_sub.dtypes.values dt = arr.dtype + if dts[0] == "M8[ns]": + self.assertEqual(arr[0, 0], np.datetime64(df_sub.iloc[0, 0])) + elif dts[0] == "m8[ns]": + self.assertEqual(arr[0, 0], np.timedelta64(df_sub.iloc[0, 0])) + else: + self.assertEqual(arr[0, 0], df_sub.iloc[0, 0]) + # all columns of the same type if len(set(dts)) == 1: - if dts[0] in (" Date: Sun, 19 Mar 2017 21:33:38 +0100 Subject: [PATCH 19/27] fix call of reshape on dti, only call on ndarray --- pandas/core/internals.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 2eabd1b0574c9..cfa991eefef16 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2390,11 +2390,12 @@ def get_values(self, dtype=None): f = lambda x: lib.Timestamp(x, tz=self.values.tz) values = lib.map_infer( self.values.ravel(), f).reshape(self.values.shape) + + if values.ndim == self.ndim - 1: + values = values.reshape((1,) + values.shape) else: - values = self.values + return self.values - if values.ndim == self.ndim - 1: - values = values.reshape((1,) + values.shape) return values def to_object_block(self, mgr): From 48e5e7942d4edb962c4fa51d69d070f87f985372 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Mon, 20 Mar 2017 06:20:43 +0100 Subject: [PATCH 20/27] use rs[mask] instead of rs.mask(mask,... 
--- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 219150d141e7a..3cdf952d99d3d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5784,7 +5784,7 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, **kwargs)) - 1) if freq is None: mask = isnull(_values_from_object(self)) - rs.mask(mask, np.nan, inplace=True) + rs[mask] = np.nan return rs def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): From 767218fbe0af25fddba24794b9fcca71f4942012 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Mon, 20 Mar 2017 06:21:26 +0100 Subject: [PATCH 21/27] use assert_numpy_array_equal when testing result of df.values and make more explicity that we test _interleave_dtype --- pandas/tests/frame/test_dtypes.py | 38 +++++++++++++++---------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 7da269f3a0a2b..561c34f77e71a 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -638,13 +638,11 @@ def test_values_is_ndarray_with_datetime64tz(self): ["B"], ]: arr = df[col].values - fst_elem = arr[0, 0] + dtype_expected = " Date: Mon, 20 Mar 2017 06:34:17 +0100 Subject: [PATCH 22/27] add test for categorical with integer values --- pandas/tests/frame/test_dtypes.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 561c34f77e71a..3d399e7b848bf 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -651,6 +651,7 @@ def test_values_dtypes_with_datetime64tz(self): 'td': (date_range('20130102', periods=3) - date_range('20130101', periods=3)), 'cat': pd.Categorical(['a', 'b', 'b']), + 'cati': pd.Categorical([100, 4, 3]), 'b': [True, False, False], 'i': [1, 2, 
3], 'f': [1.3, 2, 3], @@ -675,7 +676,10 @@ def test_values_dtypes_with_datetime64tz(self): bool, complex, int, float): dtype_expected = dts[0] else: - dtype_expected = object + if col == ("cati", ): + dtype_expected = 'int64' + else: + dtype_expected = object # different type of columns else: From 9a463a63c109097a5ed4d2081152ec0868b86a40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Mon, 20 Mar 2017 06:50:56 +0100 Subject: [PATCH 23/27] add comment to clarify difference with Series.values re tz aware columns + fix typo --- pandas/core/generic.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3cdf952d99d3d..6f56e7ff6ecba 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3076,7 +3076,10 @@ def values(self): e.g. If the dtypes are float16 and float32, dtype will be upcast to float32. If dtypes are int32 and uint8, dtype will be upcast to int32. By numpy.find_common_type convention, mixing int64 and uint64 - will result in a flot64 dtype. + will result in a float64 dtype. + + Unlike ``Series.values``, Timezone aware datetime data are + converted to ``pandas.Timestamp`` objects and not to UTC datetime64. 
""" return self.as_matrix() From 6cd4832c31190dce17e62221fd02225279acc44b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Mon, 20 Mar 2017 07:12:42 +0100 Subject: [PATCH 24/27] revert change from commit 48e5e7942d4edb962c4fa51d69d070f87f985372 as the latter breaks the test test_pct_change --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6f56e7ff6ecba..3defce16fcbfd 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5787,7 +5787,7 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, **kwargs)) - 1) if freq is None: mask = isnull(_values_from_object(self)) - rs[mask] = np.nan + rs.mask(mask, np.nan, inplace=True) return rs def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): From a72a01aeb8a3c7a83712c85b288ea9171d9e5c27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Mon, 20 Mar 2017 18:26:14 +0100 Subject: [PATCH 25/27] clarify doc following review --- doc/source/whatsnew/v0.20.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 9da2d2367685e..ae0f2181a9850 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -792,7 +792,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ -- Bug in ``DataFrame.values`` now returns an ``numpy.ndarray`` of ``pandas.Timestamp`` for tz-aware columns; previously this returned ``pandas.DateTimeIndex`` (:issue:`14052`) +- Bug in ``DataFrame.values`` now returns object dtyped numpy array of ``Timestamp`` for tz-aware columns; previously this returned ``DateTimeIndex`` (:issue:`14052`) - Bug in ``Timestamp.replace`` now raises ``TypeError`` when incorrect argument names are given; previously this raised ``ValueError`` (:issue:`15240`) - Bug in ``Index`` power operations with reversed operands 
(:issue:`14973`) - Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`) From 5158f2b9010ee27a2ae06369312d987c1984a209 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Tue, 21 Mar 2017 04:36:00 +0100 Subject: [PATCH 26/27] use rs.iloc[mask] instead of rs.mask remove bug fix on Panel support --- pandas/core/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3defce16fcbfd..f664a190b103c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4853,7 +4853,7 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, msg = "Boolean array expected for the condition, not {dtype}" - if not isinstance(cond, (pd.DataFrame, pd.Panel)): + if not isinstance(cond, pd.DataFrame): # This is a single-dimensional object. if not is_bool_dtype(cond): raise ValueError(msg.format(dtype=cond.dtype)) @@ -5787,7 +5787,7 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, **kwargs)) - 1) if freq is None: mask = isnull(_values_from_object(self)) - rs.mask(mask, np.nan, inplace=True) + rs.iloc[mask] = np.nan return rs def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): From 538af7833120f2a1df09af8a61ec8eb75370e21a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20de=20Menten?= Date: Tue, 21 Mar 2017 05:32:11 +0100 Subject: [PATCH 27/27] simplify docstring --- pandas/core/generic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f664a190b103c..d1de9d0bfa01a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3078,8 +3078,7 @@ def values(self): int32. By numpy.find_common_type convention, mixing int64 and uint64 will result in a float64 dtype. 
- Unlike ``Series.values``, Timezone aware datetime data are - converted to ``pandas.Timestamp`` objects and not to UTC datetime64. + Unlike ``Series.values``, tz-aware dtypes will be upcast to object. """ return self.as_matrix()