From 0fd3c66cfbcb14b776cdd1c2bcb977ac0293056d Mon Sep 17 00:00:00 2001
From: nickleus27
Date: Mon, 8 Nov 2021 14:48:45 -0800
Subject: [PATCH 01/53] changed variable hashed to combined_hashed in dtype.py

---
 pandas/core/dtypes/dtypes.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index 21675ca0cdc7c..98121bf06b542 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -468,12 +468,10 @@ def _hash_categories(self) -> int:
             # error: Incompatible types in assignment (expression has type
             # "List[ndarray]", variable has type "ndarray")
             cat_array = [cat_array]  # type: ignore[assignment]
-        # error: Incompatible types in assignment (expression has type "ndarray",
-        # variable has type "int")
-        hashed = combine_hash_arrays(  # type: ignore[assignment]
+        combined_hashed = combine_hash_arrays(
             iter(cat_array), num_items=len(cat_array)
         )
-        return np.bitwise_xor.reduce(hashed)
+        return np.bitwise_xor.reduce(combined_hashed)
 
     @classmethod
     def construct_array_type(cls) -> type_t[Categorical]:

From c3e784f9470bfa6d4ea7763b3c099e7c75ef6296 Mon Sep 17 00:00:00 2001
From: nickleus27
Date: Mon, 8 Nov 2021 19:28:16 -0800
Subject: [PATCH 02/53] pre-commit changes

---
 pandas/core/dtypes/dtypes.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index 98121bf06b542..e20670893f71c 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -468,9 +468,7 @@ def _hash_categories(self) -> int:
             # error: Incompatible types in assignment (expression has type
             # "List[ndarray]", variable has type "ndarray")
             cat_array = [cat_array]  # type: ignore[assignment]
-        combined_hashed = combine_hash_arrays(
-            iter(cat_array), num_items=len(cat_array)
-        )
+        combined_hashed = combine_hash_arrays(iter(cat_array), num_items=len(cat_array))
         return np.bitwise_xor.reduce(combined_hashed)
 
     @classmethod

From 540481735e5df4b02eef42962aae25a4f6381b42 Mon Sep 17 00:00:00 2001
From: nickleus27
Date: Tue, 9 Nov 2021 13:55:57 -0800
Subject: [PATCH 03/53] changed cat_array assignment to cat_array_list

---
 pandas/core/dtypes/dtypes.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index e20670893f71c..7d84d903c84b0 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -465,9 +465,11 @@ def _hash_categories(self) -> int:
                 [cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)]
             )
         else:
-            # error: Incompatible types in assignment (expression has type
-            # "List[ndarray]", variable has type "ndarray")
-            cat_array = [cat_array]  # type: ignore[assignment]
+            cat_array_list = [cat_array]
+            combined_hashed = combine_hash_arrays(
+                iter(cat_array_list), num_items=len(cat_array_list)
+            )
+            return np.bitwise_xor.reduce(combined_hashed)
         combined_hashed = combine_hash_arrays(iter(cat_array), num_items=len(cat_array))
         return np.bitwise_xor.reduce(combined_hashed)
 
     @classmethod

From 3cefbcc8b00067d8d9ff2145d7a5d6a76b0c3363 Mon Sep 17 00:00:00 2001
From: nickleus27
Date: Sat, 13 Nov 2021 21:31:00 -0800
Subject: [PATCH 04/53] changed arr to arr_lst

---
 pandas/core/arrays/categorical.py | 66 ++++++++++++++-----------------
 1 file changed, 29 insertions(+), 37 deletions(-)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index c7f587b35f557..145ff60a28f46 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -6,7 +6,6 @@
 from shutil import get_terminal_size
 from typing import (
     TYPE_CHECKING,
-    Any,
     Hashable,
     Sequence,
     TypeVar,
@@ -38,10 +37,6 @@
     Dtype,
     NpDtype,
     Ordered,
-    PositionalIndexer2D,
-    PositionalIndexerTuple,
-    ScalarIndexer,
-    SequenceIndexer,
     Shape,
     npt,
     type_t,
@@ -102,7 +97,10 @@
     take_nd,
     unique1d,
 )
-from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
+from pandas.core.arrays._mixins import (
+    NDArrayBackedExtensionArray,
+    ravel_compat,
+)
 from pandas.core.base import (
     ExtensionArray,
     NoNewAttributesMixin,
@@ -113,7 +111,6 @@
     extract_array,
     sanitize_array,
 )
-from pandas.core.indexers import deprecate_ndim_indexing
 from pandas.core.ops.common import unpack_zerodim_and_defer
 from pandas.core.sorting import nargsort
 from pandas.core.strings.object_array import ObjectStringArrayMixin
@@ -424,13 +421,8 @@ def __init__(
             if null_mask.any():
                 # We remove null values here, then below will re-insert
                 # them, grep "full_codes"
-
-                # error: Incompatible types in assignment (expression has type
-                # "List[Any]", variable has type "ExtensionArray")
-                arr = [  # type: ignore[assignment]
-                    values[idx] for idx in np.where(~null_mask)[0]
-                ]
-                arr = sanitize_array(arr, None)
+                arr_lst = [values[idx] for idx in np.where(~null_mask)[0]]
+                arr = sanitize_array(arr_lst, None)
                 values = arr
 
         if dtype.categories is None:
@@ -1484,6 +1476,7 @@ def _validate_scalar(self, fill_value):
 
     # -------------------------------------------------------------
 
+    @ravel_compat
     def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
         """
         The numpy array interface.
@@ -1934,7 +1927,10 @@ def __iter__(self):
         """
         Returns an Iterator over the values of this Categorical.
         """
-        return iter(self._internal_get_values().tolist())
+        if self.ndim == 1:
+            return iter(self._internal_get_values().tolist())
+        else:
+            return (self[n] for n in range(len(self)))
 
     def __contains__(self, key) -> bool:
         """
@@ -2053,27 +2049,6 @@ def __repr__(self) -> str:
 
     # ------------------------------------------------------------------
 
-    @overload
-    def __getitem__(self, key: ScalarIndexer) -> Any:
-        ...
-
-    @overload
-    def __getitem__(
-        self: CategoricalT,
-        key: SequenceIndexer | PositionalIndexerTuple,
-    ) -> CategoricalT:
-        ...
-
-    def __getitem__(self: CategoricalT, key: PositionalIndexer2D) -> CategoricalT | Any:
-        """
-        Return an item.
-        """
-        result = super().__getitem__(key)
-        if getattr(result, "ndim", 0) > 1:
-            result = result._ndarray
-            deprecate_ndim_indexing(result)
-        return result
-
     def _validate_listlike(self, value):
         # NB: here we assume scalar-like tuples have already been excluded
         value = extract_array(value, extract_numpy=True)
@@ -2311,7 +2286,19 @@ def _concat_same_type(
     ) -> CategoricalT:
         from pandas.core.dtypes.concat import union_categoricals
 
-        return union_categoricals(to_concat)
+        result = union_categoricals(to_concat)
+
+        # in case we are concatenating along axis != 0, we need to reshape
+        # the result from union_categoricals
+        first = to_concat[0]
+        if axis >= first.ndim:
+            raise ValueError
+        if axis == 1:
+            if not all(len(x) == len(first) for x in to_concat):
+                raise ValueError
+            # TODO: Will this get contiguity wrong?
+ result = result.reshape(-1, len(to_concat), order="F") + return result # ------------------------------------------------------------------ @@ -2699,6 +2686,11 @@ def _get_codes_for_values(values, categories: Index) -> np.ndarray: """ dtype_equal = is_dtype_equal(values.dtype, categories.dtype) + if values.ndim > 1: + flat = values.ravel() + codes = _get_codes_for_values(flat, categories) + return codes.reshape(values.shape) + if isinstance(categories.dtype, ExtensionDtype) and is_object_dtype(values): # Support inferring the correct extension dtype from an array of # scalar objects. e.g. From 80bd1f1e787becf5b2078550e9953d711586b550 Mon Sep 17 00:00:00 2001 From: nickleus27 Date: Sat, 13 Nov 2021 21:49:29 -0800 Subject: [PATCH 05/53] Revert "changed arr to arr_lst" This reverts commit 3cefbcc8b00067d8d9ff2145d7a5d6a76b0c3363. --- pandas/core/arrays/categorical.py | 66 +++++++++++++++++-------------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 145ff60a28f46..c7f587b35f557 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -6,6 +6,7 @@ from shutil import get_terminal_size from typing import ( TYPE_CHECKING, + Any, Hashable, Sequence, TypeVar, @@ -37,6 +38,10 @@ Dtype, NpDtype, Ordered, + PositionalIndexer2D, + PositionalIndexerTuple, + ScalarIndexer, + SequenceIndexer, Shape, npt, type_t, @@ -97,10 +102,7 @@ take_nd, unique1d, ) -from pandas.core.arrays._mixins import ( - NDArrayBackedExtensionArray, - ravel_compat, -) +from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.base import ( ExtensionArray, NoNewAttributesMixin, @@ -111,6 +113,7 @@ extract_array, sanitize_array, ) +from pandas.core.indexers import deprecate_ndim_indexing from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.sorting import nargsort from pandas.core.strings.object_array import ObjectStringArrayMixin @@ -421,8 +424,13 @@ def __init__( if null_mask.any(): # We remove null values here, then below will re-insert # them, grep "full_codes" - arr_lst = [values[idx] for idx in np.where(~null_mask)[0]] - arr = sanitize_array(arr_lst, None) + + # error: Incompatible types in assignment (expression has type + # "List[Any]", variable has type "ExtensionArray") + arr = [ # type: ignore[assignment] + values[idx] for idx in np.where(~null_mask)[0] + ] + arr = sanitize_array(arr, None) values = arr if dtype.categories is None: @@ -1476,7 +1484,6 @@ def _validate_scalar(self, fill_value): # ------------------------------------------------------------- - @ravel_compat def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: """ The numpy array interface. @@ -1927,10 +1934,7 @@ def __iter__(self): """ Returns an Iterator over the values of this Categorical. """ - if self.ndim == 1: - return iter(self._internal_get_values().tolist()) - else: - return (self[n] for n in range(len(self))) + return iter(self._internal_get_values().tolist()) def __contains__(self, key) -> bool: """ @@ -2049,6 +2053,27 @@ def __repr__(self) -> str: # ------------------------------------------------------------------ + @overload + def __getitem__(self, key: ScalarIndexer) -> Any: + ... + + @overload + def __getitem__( + self: CategoricalT, + key: SequenceIndexer | PositionalIndexerTuple, + ) -> CategoricalT: + ... + + def __getitem__(self: CategoricalT, key: PositionalIndexer2D) -> CategoricalT | Any: + """ + Return an item. 
+ """ + result = super().__getitem__(key) + if getattr(result, "ndim", 0) > 1: + result = result._ndarray + deprecate_ndim_indexing(result) + return result + def _validate_listlike(self, value): # NB: here we assume scalar-like tuples have already been excluded value = extract_array(value, extract_numpy=True) @@ -2286,19 +2311,7 @@ def _concat_same_type( ) -> CategoricalT: from pandas.core.dtypes.concat import union_categoricals - result = union_categoricals(to_concat) - - # in case we are concatenating along axis != 0, we need to reshape - # the result from union_categoricals - first = to_concat[0] - if axis >= first.ndim: - raise ValueError - if axis == 1: - if not all(len(x) == len(first) for x in to_concat): - raise ValueError - # TODO: Will this get contiguity wrong? - result = result.reshape(-1, len(to_concat), order="F") - return result + return union_categoricals(to_concat) # ------------------------------------------------------------------ @@ -2686,11 +2699,6 @@ def _get_codes_for_values(values, categories: Index) -> np.ndarray: """ dtype_equal = is_dtype_equal(values.dtype, categories.dtype) - if values.ndim > 1: - flat = values.ravel() - codes = _get_codes_for_values(flat, categories) - return codes.reshape(values.shape) - if isinstance(categories.dtype, ExtensionDtype) and is_object_dtype(values): # Support inferring the correct extension dtype from an array of # scalar objects. e.g. From 1601c53f28e2271603f9e5019e0714e6f8c9f1e0 Mon Sep 17 00:00:00 2001 From: nickleus27 Date: Sat, 13 Nov 2021 21:56:54 -0800 Subject: [PATCH 06/53] changed arr to arr_lst --- pandas/core/arrays/categorical.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c7f587b35f557..609543a261a1c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -424,13 +424,8 @@ def __init__( if null_mask.any(): # We remove null values here, then below will re-insert # them, grep "full_codes" - - # error: Incompatible types in assignment (expression has type - # "List[Any]", variable has type "ExtensionArray") - arr = [ # type: ignore[assignment] - values[idx] for idx in np.where(~null_mask)[0] - ] - arr = sanitize_array(arr, None) + arr_lst = [values[idx] for idx in np.where(~null_mask)[0]] + arr = sanitize_array(arr_lst, None) values = arr if dtype.categories is None: From 2338a2246ae9c2cd8ef6671b4f4bc9d8aa9943f8 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 9 Nov 2021 17:43:48 -0800 Subject: [PATCH 07/53] CLN: split giant dt accessor tests (#44355) --- .../series/accessors/test_dt_accessor.py | 416 ++++++++++-------- 1 file changed, 225 insertions(+), 191 deletions(-) diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index eb7e1d4268605..48a3ebd25c239 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -39,121 +39,136 @@ ) import pandas.core.common as com +ok_for_period = PeriodArray._datetimelike_ops +ok_for_period_methods = ["strftime", "to_timestamp", "asfreq"] +ok_for_dt = DatetimeArray._datetimelike_ops +ok_for_dt_methods = [ + "to_period", + "to_pydatetime", + "tz_localize", + "tz_convert", + "normalize", + "strftime", + "round", + "floor", + "ceil", + "day_name", + "month_name", + "isocalendar", +] +ok_for_td = TimedeltaArray._datetimelike_ops +ok_for_td_methods = [ + "components", + "to_pytimedelta", + "total_seconds", + "round", + 
"floor", + "ceil", +] + + +def get_dir(ser): + # check limited display api + results = [r for r in ser.dt.__dir__() if not r.startswith("_")] + return sorted(set(results)) -class TestSeriesDatetimeValues: - def test_dt_namespace_accessor(self): +class TestSeriesDatetimeValues: + def _compare(self, ser, name): # GH 7207, 11128 # test .dt namespace accessor - ok_for_period = PeriodArray._datetimelike_ops - ok_for_period_methods = ["strftime", "to_timestamp", "asfreq"] - ok_for_dt = DatetimeArray._datetimelike_ops - ok_for_dt_methods = [ - "to_period", - "to_pydatetime", - "tz_localize", - "tz_convert", - "normalize", - "strftime", - "round", - "floor", - "ceil", - "day_name", - "month_name", - "isocalendar", - ] - ok_for_td = TimedeltaArray._datetimelike_ops - ok_for_td_methods = [ - "components", - "to_pytimedelta", - "total_seconds", - "round", - "floor", - "ceil", - ] - - def get_expected(s, name): - result = getattr(Index(s._values), prop) + def get_expected(ser, prop): + result = getattr(Index(ser._values), prop) if isinstance(result, np.ndarray): if is_integer_dtype(result): result = result.astype("int64") elif not is_list_like(result) or isinstance(result, DataFrame): return result - return Series(result, index=s.index, name=s.name) - - def compare(s, name): - a = getattr(s.dt, prop) - b = get_expected(s, prop) - if not (is_list_like(a) and is_list_like(b)): - assert a == b - elif isinstance(a, DataFrame): - tm.assert_frame_equal(a, b) - else: - tm.assert_series_equal(a, b) + return Series(result, index=ser.index, name=ser.name) + + left = getattr(ser.dt, name) + right = get_expected(ser, name) + if not (is_list_like(left) and is_list_like(right)): + assert left == right + elif isinstance(left, DataFrame): + tm.assert_frame_equal(left, right) + else: + tm.assert_series_equal(left, right) + + @pytest.mark.parametrize("freq", ["D", "s", "ms"]) + def test_dt_namespace_accessor_datetime64(self, freq): + # GH#7207, GH#11128 + # test .dt namespace accessor # datetimeindex - cases = [ - Series(date_range("20130101", periods=5), name="xxx"), - Series(date_range("20130101", periods=5, freq="s"), name="xxx"), - Series(date_range("20130101 00:00:00", periods=5, freq="ms"), name="xxx"), - ] - for s in cases: - for prop in ok_for_dt: - # we test freq below - # we ignore week and weekofyear because they are deprecated - if prop not in ["freq", "week", "weekofyear"]: - compare(s, prop) + dti = date_range("20130101", periods=5, freq=freq) + ser = Series(dti, name="xxx") - for prop in ok_for_dt_methods: - getattr(s.dt, prop) + for prop in ok_for_dt: + # we test freq below + # we ignore week and weekofyear because they are deprecated + if prop not in ["freq", "week", "weekofyear"]: + self._compare(ser, prop) - result = s.dt.to_pydatetime() - assert isinstance(result, np.ndarray) - assert result.dtype == object + for prop in ok_for_dt_methods: + getattr(ser.dt, prop) - result = s.dt.tz_localize("US/Eastern") - exp_values = DatetimeIndex(s.values).tz_localize("US/Eastern") - expected = Series(exp_values, index=s.index, name="xxx") - tm.assert_series_equal(result, expected) + result = ser.dt.to_pydatetime() + assert isinstance(result, np.ndarray) + assert result.dtype == object - tz_result = result.dt.tz - assert str(tz_result) == "US/Eastern" - freq_result = s.dt.freq - assert freq_result == DatetimeIndex(s.values, freq="infer").freq - - # let's localize, then convert - result = s.dt.tz_localize("UTC").dt.tz_convert("US/Eastern") - exp_values = ( - 
DatetimeIndex(s.values).tz_localize("UTC").tz_convert("US/Eastern") - ) - expected = Series(exp_values, index=s.index, name="xxx") - tm.assert_series_equal(result, expected) + result = ser.dt.tz_localize("US/Eastern") + exp_values = DatetimeIndex(ser.values).tz_localize("US/Eastern") + expected = Series(exp_values, index=ser.index, name="xxx") + tm.assert_series_equal(result, expected) + + tz_result = result.dt.tz + assert str(tz_result) == "US/Eastern" + freq_result = ser.dt.freq + assert freq_result == DatetimeIndex(ser.values, freq="infer").freq + + # let's localize, then convert + result = ser.dt.tz_localize("UTC").dt.tz_convert("US/Eastern") + exp_values = ( + DatetimeIndex(ser.values).tz_localize("UTC").tz_convert("US/Eastern") + ) + expected = Series(exp_values, index=ser.index, name="xxx") + tm.assert_series_equal(result, expected) + + def test_dt_namespace_accessor_datetime64tz(self): + # GH#7207, GH#11128 + # test .dt namespace accessor # datetimeindex with tz - s = Series(date_range("20130101", periods=5, tz="US/Eastern"), name="xxx") + dti = date_range("20130101", periods=5, tz="US/Eastern") + ser = Series(dti, name="xxx") for prop in ok_for_dt: # we test freq below # we ignore week and weekofyear because they are deprecated if prop not in ["freq", "week", "weekofyear"]: - compare(s, prop) + self._compare(ser, prop) for prop in ok_for_dt_methods: - getattr(s.dt, prop) + getattr(ser.dt, prop) - result = s.dt.to_pydatetime() + result = ser.dt.to_pydatetime() assert isinstance(result, np.ndarray) assert result.dtype == object - result = s.dt.tz_convert("CET") - expected = Series(s._values.tz_convert("CET"), index=s.index, name="xxx") + result = ser.dt.tz_convert("CET") + expected = Series(ser._values.tz_convert("CET"), index=ser.index, name="xxx") tm.assert_series_equal(result, expected) tz_result = result.dt.tz assert str(tz_result) == "CET" - freq_result = s.dt.freq - assert freq_result == DatetimeIndex(s.values, freq="infer").freq + freq_result = ser.dt.freq + assert freq_result == DatetimeIndex(ser.values, freq="infer").freq + + def test_dt_namespace_accessor_timedelta(self): + # GH#7207, GH#11128 + # test .dt namespace accessor # timedelta index cases = [ @@ -166,102 +181,115 @@ def compare(s, name): name="xxx", ), ] - for s in cases: + for ser in cases: for prop in ok_for_td: # we test freq below if prop != "freq": - compare(s, prop) + self._compare(ser, prop) for prop in ok_for_td_methods: - getattr(s.dt, prop) + getattr(ser.dt, prop) - result = s.dt.components + result = ser.dt.components assert isinstance(result, DataFrame) - tm.assert_index_equal(result.index, s.index) + tm.assert_index_equal(result.index, ser.index) - result = s.dt.to_pytimedelta() + result = ser.dt.to_pytimedelta() assert isinstance(result, np.ndarray) assert result.dtype == object - result = s.dt.total_seconds() + result = ser.dt.total_seconds() assert isinstance(result, Series) assert result.dtype == "float64" - freq_result = s.dt.freq - assert freq_result == TimedeltaIndex(s.values, freq="infer").freq + freq_result = ser.dt.freq + assert freq_result == TimedeltaIndex(ser.values, freq="infer").freq + + def test_dt_namespace_accessor_period(self): + # GH#7207, GH#11128 + # test .dt namespace accessor + + # periodindex + pi = period_range("20130101", periods=5, freq="D") + ser = Series(pi, name="xxx") + + for prop in ok_for_period: + # we test freq below + if prop != "freq": + self._compare(ser, prop) + + for prop in ok_for_period_methods: + getattr(ser.dt, prop) + + freq_result = ser.dt.freq + assert 
freq_result == PeriodIndex(ser.values).freq + + def test_dt_namespace_accessor_index_and_values(self): # both index = date_range("20130101", periods=3, freq="D") - s = Series(date_range("20140204", periods=3, freq="s"), index=index, name="xxx") + dti = date_range("20140204", periods=3, freq="s") + ser = Series(dti, index=index, name="xxx") exp = Series( np.array([2014, 2014, 2014], dtype="int64"), index=index, name="xxx" ) - tm.assert_series_equal(s.dt.year, exp) + tm.assert_series_equal(ser.dt.year, exp) exp = Series(np.array([2, 2, 2], dtype="int64"), index=index, name="xxx") - tm.assert_series_equal(s.dt.month, exp) + tm.assert_series_equal(ser.dt.month, exp) exp = Series(np.array([0, 1, 2], dtype="int64"), index=index, name="xxx") - tm.assert_series_equal(s.dt.second, exp) - - exp = Series([s[0]] * 3, index=index, name="xxx") - tm.assert_series_equal(s.dt.normalize(), exp) - - # periodindex - cases = [Series(period_range("20130101", periods=5, freq="D"), name="xxx")] - for s in cases: - for prop in ok_for_period: - # we test freq below - if prop != "freq": - compare(s, prop) - - for prop in ok_for_period_methods: - getattr(s.dt, prop) + tm.assert_series_equal(ser.dt.second, exp) - freq_result = s.dt.freq - assert freq_result == PeriodIndex(s.values).freq + exp = Series([ser[0]] * 3, index=index, name="xxx") + tm.assert_series_equal(ser.dt.normalize(), exp) - # test limited display api - def get_dir(s): - results = [r for r in s.dt.__dir__() if not r.startswith("_")] - return sorted(set(results)) + def test_dt_accessor_limited_display_api(self): + # tznaive + ser = Series(date_range("20130101", periods=5, freq="D"), name="xxx") + results = get_dir(ser) + tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) - s = Series(date_range("20130101", periods=5, freq="D"), name="xxx") - results = get_dir(s) + # tzaware + ser = Series(date_range("2015-01-01", "2016-01-01", freq="T"), name="xxx") + ser = ser.dt.tz_localize("UTC").dt.tz_convert("America/Chicago") + results = get_dir(ser) tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) - s = Series( + # Period + ser = Series( period_range("20130101", periods=5, freq="D", name="xxx").astype(object) ) - results = get_dir(s) + results = get_dir(ser) tm.assert_almost_equal( results, sorted(set(ok_for_period + ok_for_period_methods)) ) - # 11295 + def test_dt_accessor_ambiguous_freq_conversions(self): + # GH#11295 # ambiguous time error on the conversions - s = Series(date_range("2015-01-01", "2016-01-01", freq="T"), name="xxx") - s = s.dt.tz_localize("UTC").dt.tz_convert("America/Chicago") - results = get_dir(s) - tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) + ser = Series(date_range("2015-01-01", "2016-01-01", freq="T"), name="xxx") + ser = ser.dt.tz_localize("UTC").dt.tz_convert("America/Chicago") + exp_values = date_range( "2015-01-01", "2016-01-01", freq="T", tz="UTC" ).tz_convert("America/Chicago") # freq not preserved by tz_localize above exp_values = exp_values._with_freq(None) expected = Series(exp_values, name="xxx") - tm.assert_series_equal(s, expected) + tm.assert_series_equal(ser, expected) + def test_dt_accessor_not_writeable(self): # no setting allowed - s = Series(date_range("20130101", periods=5, freq="D"), name="xxx") + ser = Series(date_range("20130101", periods=5, freq="D"), name="xxx") with pytest.raises(ValueError, match="modifications"): - s.dt.hour = 5 + ser.dt.hour = 5 # trying to set a copy msg = "modifications to a property of a datetimelike.+not 
supported" with pd.option_context("chained_assignment", "raise"): with pytest.raises(com.SettingWithCopyError, match=msg): - s.dt.hour[0] = 5 + ser.dt.hour[0] = 5 @pytest.mark.parametrize( "method, dates", @@ -273,24 +301,24 @@ def get_dir(s): ) def test_dt_round(self, method, dates): # round - s = Series( + ser = Series( pd.to_datetime( ["2012-01-01 13:00:00", "2012-01-01 12:01:00", "2012-01-01 08:00:00"] ), name="xxx", ) - result = getattr(s.dt, method)("D") + result = getattr(ser.dt, method)("D") expected = Series(pd.to_datetime(dates), name="xxx") tm.assert_series_equal(result, expected) def test_dt_round_tz(self): - s = Series( + ser = Series( pd.to_datetime( ["2012-01-01 13:00:00", "2012-01-01 12:01:00", "2012-01-01 08:00:00"] ), name="xxx", ) - result = s.dt.tz_localize("UTC").dt.tz_convert("US/Eastern").dt.round("D") + result = ser.dt.tz_localize("UTC").dt.tz_convert("US/Eastern").dt.round("D") exp_values = pd.to_datetime( ["2012-01-01", "2012-01-01", "2012-01-01"] @@ -339,23 +367,23 @@ def test_dt_round_tz_ambiguous(self, method): ) def test_dt_round_tz_nonexistent(self, method, ts_str, freq): # GH 23324 round near "spring forward" DST - s = Series([pd.Timestamp(ts_str, tz="America/Chicago")]) - result = getattr(s.dt, method)(freq, nonexistent="shift_forward") + ser = Series([pd.Timestamp(ts_str, tz="America/Chicago")]) + result = getattr(ser.dt, method)(freq, nonexistent="shift_forward") expected = Series([pd.Timestamp("2018-03-11 03:00:00", tz="America/Chicago")]) tm.assert_series_equal(result, expected) - result = getattr(s.dt, method)(freq, nonexistent="NaT") + result = getattr(ser.dt, method)(freq, nonexistent="NaT") expected = Series([pd.NaT]).dt.tz_localize(result.dt.tz) tm.assert_series_equal(result, expected) with pytest.raises(pytz.NonExistentTimeError, match="2018-03-11 02:00:00"): - getattr(s.dt, method)(freq, nonexistent="raise") + getattr(ser.dt, method)(freq, nonexistent="raise") def test_dt_namespace_accessor_categorical(self): # GH 19468 dti = DatetimeIndex(["20171111", "20181212"]).repeat(2) - s = Series(pd.Categorical(dti), name="foo") - result = s.dt.year + ser = Series(pd.Categorical(dti), name="foo") + result = ser.dt.year expected = Series([2017, 2017, 2018, 2018], name="foo") tm.assert_series_equal(result, expected) @@ -394,9 +422,9 @@ def test_dt_other_accessors_categorical(self, accessor): def test_dt_accessor_no_new_attributes(self): # https://github.com/pandas-dev/pandas/issues/10673 - s = Series(date_range("20130101", periods=5, freq="D")) + ser = Series(date_range("20130101", periods=5, freq="D")) with pytest.raises(AttributeError, match="You cannot add any new attribute"): - s.dt.xlabel = "a" + ser.dt.xlabel = "a" @pytest.mark.parametrize( "time_locale", [None] if tm.get_locales() is None else [None] + tm.get_locales() @@ -434,7 +462,7 @@ def test_dt_accessor_datetime_name_accessors(self, time_locale): expected_days = calendar.day_name[:] expected_months = calendar.month_name[1:] - s = Series(date_range(freq="D", start=datetime(1998, 1, 1), periods=365)) + ser = Series(date_range(freq="D", start=datetime(1998, 1, 1), periods=365)) english_days = [ "Monday", "Tuesday", @@ -446,13 +474,13 @@ def test_dt_accessor_datetime_name_accessors(self, time_locale): ] for day, name, eng_name in zip(range(4, 11), expected_days, english_days): name = name.capitalize() - assert s.dt.day_name(locale=time_locale)[day] == name - assert s.dt.day_name(locale=None)[day] == eng_name - s = s.append(Series([pd.NaT])) - assert 
np.isnan(s.dt.day_name(locale=time_locale).iloc[-1]) + assert ser.dt.day_name(locale=time_locale)[day] == name + assert ser.dt.day_name(locale=None)[day] == eng_name + ser = ser.append(Series([pd.NaT])) + assert np.isnan(ser.dt.day_name(locale=time_locale).iloc[-1]) - s = Series(date_range(freq="M", start="2012", end="2013")) - result = s.dt.month_name(locale=time_locale) + ser = Series(date_range(freq="M", start="2012", end="2013")) + result = ser.dt.month_name(locale=time_locale) expected = Series([month.capitalize() for month in expected_months]) # work around https://github.com/pandas-dev/pandas/issues/22342 @@ -461,7 +489,7 @@ def test_dt_accessor_datetime_name_accessors(self, time_locale): tm.assert_series_equal(result, expected) - for s_date, expected in zip(s, expected_months): + for s_date, expected in zip(ser, expected_months): result = s_date.month_name(locale=time_locale) expected = expected.capitalize() @@ -470,20 +498,20 @@ def test_dt_accessor_datetime_name_accessors(self, time_locale): assert result == expected - s = s.append(Series([pd.NaT])) - assert np.isnan(s.dt.month_name(locale=time_locale).iloc[-1]) + ser = ser.append(Series([pd.NaT])) + assert np.isnan(ser.dt.month_name(locale=time_locale).iloc[-1]) def test_strftime(self): # GH 10086 - s = Series(date_range("20130101", periods=5)) - result = s.dt.strftime("%Y/%m/%d") + ser = Series(date_range("20130101", periods=5)) + result = ser.dt.strftime("%Y/%m/%d") expected = Series( ["2013/01/01", "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"] ) tm.assert_series_equal(result, expected) - s = Series(date_range("2015-02-03 11:22:33.4567", periods=5)) - result = s.dt.strftime("%Y/%m/%d %H-%M-%S") + ser = Series(date_range("2015-02-03 11:22:33.4567", periods=5)) + result = ser.dt.strftime("%Y/%m/%d %H-%M-%S") expected = Series( [ "2015/02/03 11-22-33", @@ -495,15 +523,15 @@ def test_strftime(self): ) tm.assert_series_equal(result, expected) - s = Series(period_range("20130101", periods=5)) - result = s.dt.strftime("%Y/%m/%d") + ser = Series(period_range("20130101", periods=5)) + result = ser.dt.strftime("%Y/%m/%d") expected = Series( ["2013/01/01", "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"] ) tm.assert_series_equal(result, expected) - s = Series(period_range("2015-02-03 11:22:33.4567", periods=5, freq="s")) - result = s.dt.strftime("%Y/%m/%d %H-%M-%S") + ser = Series(period_range("2015-02-03 11:22:33.4567", periods=5, freq="s")) + result = ser.dt.strftime("%Y/%m/%d %H-%M-%S") expected = Series( [ "2015/02/03 11-22-33", @@ -515,9 +543,10 @@ def test_strftime(self): ) tm.assert_series_equal(result, expected) - s = Series(date_range("20130101", periods=5)) - s.iloc[0] = pd.NaT - result = s.dt.strftime("%Y/%m/%d") + def test_strftime_dt64_days(self): + ser = Series(date_range("20130101", periods=5)) + ser.iloc[0] = pd.NaT + result = ser.dt.strftime("%Y/%m/%d") expected = Series( [np.nan, "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"] ) @@ -533,6 +562,7 @@ def test_strftime(self): # dtype may be S10 or U10 depending on python version tm.assert_index_equal(result, expected) + def test_strftime_period_days(self): period_index = period_range("20150301", periods=5) result = period_index.strftime("%Y/%m/%d") expected = Index( @@ -541,13 +571,15 @@ def test_strftime(self): ) tm.assert_index_equal(result, expected) - s = Series([datetime(2013, 1, 1, 2, 32, 59), datetime(2013, 1, 2, 14, 32, 1)]) - result = s.dt.strftime("%Y-%m-%d %H:%M:%S") + def test_strftime_dt64_microsecond_resolution(self): + ser = 
Series([datetime(2013, 1, 1, 2, 32, 59), datetime(2013, 1, 2, 14, 32, 1)]) + result = ser.dt.strftime("%Y-%m-%d %H:%M:%S") expected = Series(["2013-01-01 02:32:59", "2013-01-02 14:32:01"]) tm.assert_series_equal(result, expected) - s = Series(period_range("20130101", periods=4, freq="H")) - result = s.dt.strftime("%Y/%m/%d %H:%M:%S") + def test_strftime_period_hours(self): + ser = Series(period_range("20130101", periods=4, freq="H")) + result = ser.dt.strftime("%Y/%m/%d %H:%M:%S") expected = Series( [ "2013/01/01 00:00:00", @@ -556,9 +588,11 @@ def test_strftime(self): "2013/01/01 03:00:00", ] ) + tm.assert_series_equal(result, expected) - s = Series(period_range("20130101", periods=4, freq="L")) - result = s.dt.strftime("%Y/%m/%d %H:%M:%S.%l") + def test_strftime_period_minutes(self): + ser = Series(period_range("20130101", periods=4, freq="L")) + result = ser.dt.strftime("%Y/%m/%d %H:%M:%S.%l") expected = Series( [ "2013/01/01 00:00:00.000", @@ -578,8 +612,8 @@ def test_strftime(self): ) def test_strftime_nat(self, data): # GH 29578 - s = Series(data) - result = s.dt.strftime("%Y-%m-%d") + ser = Series(data) + result = ser.dt.strftime("%Y-%m-%d") expected = Series(["2019-01-01", np.nan]) tm.assert_series_equal(result, expected) @@ -591,16 +625,16 @@ def test_valid_dt_with_missing_values(self): ) # GH 8689 - s = Series(date_range("20130101", periods=5, freq="D")) - s.iloc[2] = pd.NaT + ser = Series(date_range("20130101", periods=5, freq="D")) + ser.iloc[2] = pd.NaT for attr in ["microsecond", "nanosecond", "second", "minute", "hour", "day"]: - expected = getattr(s.dt, attr).copy() + expected = getattr(ser.dt, attr).copy() expected.iloc[2] = np.nan - result = getattr(s.dt, attr) + result = getattr(ser.dt, attr) tm.assert_series_equal(result, expected) - result = s.dt.date + result = ser.dt.date expected = Series( [ date(2013, 1, 1), @@ -613,7 +647,7 @@ def test_valid_dt_with_missing_values(self): ) tm.assert_series_equal(result, expected) - result = s.dt.time + result = ser.dt.time expected = Series([time(0), time(0), np.nan, time(0), time(0)], dtype="object") tm.assert_series_equal(result, expected) @@ -626,8 +660,8 @@ def test_dt_accessor_api(self): assert Series.dt is CombinedDatetimelikeProperties - s = Series(date_range("2000-01-01", periods=3)) - assert isinstance(s.dt, DatetimeProperties) + ser = Series(date_range("2000-01-01", periods=3)) + assert isinstance(ser.dt, DatetimeProperties) @pytest.mark.parametrize( "ser", [Series(np.arange(5)), Series(list("abcde")), Series(np.random.randn(5))] @@ -639,11 +673,11 @@ def test_dt_accessor_invalid(self, ser): assert not hasattr(ser, "dt") def test_dt_accessor_updates_on_inplace(self): - s = Series(date_range("2018-01-01", periods=10)) - s[2] = None - return_value = s.fillna(pd.Timestamp("2018-01-01"), inplace=True) + ser = Series(date_range("2018-01-01", periods=10)) + ser[2] = None + return_value = ser.fillna(pd.Timestamp("2018-01-01"), inplace=True) assert return_value is None - result = s.dt.date + result = ser.dt.date assert result[0] == result[2] def test_date_tz(self): @@ -652,10 +686,10 @@ def test_date_tz(self): ["2014-04-04 23:56", "2014-07-18 21:24", "2015-11-22 22:14"], tz="US/Eastern", ) - s = Series(rng) + ser = Series(rng) expected = Series([date(2014, 4, 4), date(2014, 7, 18), date(2015, 11, 22)]) - tm.assert_series_equal(s.dt.date, expected) - tm.assert_series_equal(s.apply(lambda x: x.date()), expected) + tm.assert_series_equal(ser.dt.date, expected) + tm.assert_series_equal(ser.apply(lambda x: x.date()), expected) def 
test_dt_timetz_accessor(self, tz_naive_fixture): # GH21358 @@ -664,11 +698,11 @@ def test_dt_timetz_accessor(self, tz_naive_fixture): dtindex = DatetimeIndex( ["2014-04-04 23:56", "2014-07-18 21:24", "2015-11-22 22:14"], tz=tz ) - s = Series(dtindex) + ser = Series(dtindex) expected = Series( [time(23, 56, tzinfo=tz), time(21, 24, tzinfo=tz), time(22, 14, tzinfo=tz)] ) - result = s.dt.timetz + result = ser.dt.timetz tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -731,9 +765,9 @@ def test_end_time_timevalues(self, input_vals): # when using the dt accessor on a Series input_vals = PeriodArray._from_sequence(np.asarray(input_vals)) - s = Series(input_vals) - result = s.dt.end_time - expected = s.apply(lambda x: x.end_time) + ser = Series(input_vals) + result = ser.dt.end_time + expected = ser.apply(lambda x: x.end_time) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("input_vals", [("2001"), ("NaT")]) @@ -755,7 +789,7 @@ def test_week_and_weekofyear_are_deprecated(): def test_normalize_pre_epoch_dates(): # GH: 36294 - s = pd.to_datetime(Series(["1969-01-01 09:00:00", "2016-01-01 09:00:00"])) - result = s.dt.normalize() + ser = pd.to_datetime(Series(["1969-01-01 09:00:00", "2016-01-01 09:00:00"])) + result = ser.dt.normalize() expected = pd.to_datetime(Series(["1969-01-01", "2016-01-01"])) tm.assert_series_equal(result, expected) From 2d9bddfdb9f7962b746bd817d86b9123bc515145 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 9 Nov 2021 17:45:18 -0800 Subject: [PATCH 08/53] REF: re-remove _putmask_preserve (#44346) --- pandas/core/array_algos/putmask.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pandas/core/array_algos/putmask.py b/pandas/core/array_algos/putmask.py index 54324bf721945..77e38e6c6e3fc 100644 --- a/pandas/core/array_algos/putmask.py +++ b/pandas/core/array_algos/putmask.py @@ -126,7 +126,8 @@ def putmask_smart(values: np.ndarray, mask: npt.NDArray[np.bool_], new) -> np.nd if values.dtype.kind == new.dtype.kind: # preserves dtype if possible - return _putmask_preserve(values, new, mask) + np.putmask(values, mask, new) + return values dtype = find_common_type([values.dtype, new.dtype]) # error: Argument 1 to "astype" of "_ArrayOrScalarCommon" has incompatible type @@ -135,15 +136,8 @@ def putmask_smart(values: np.ndarray, mask: npt.NDArray[np.bool_], new) -> np.nd # List[Any], _DTypeDict, Tuple[Any, Any]]]" values = values.astype(dtype) # type: ignore[arg-type] - return _putmask_preserve(values, new, mask) - - -def _putmask_preserve(new_values: np.ndarray, new, mask: npt.NDArray[np.bool_]): - try: - new_values[mask] = new[mask] - except (IndexError, ValueError): - new_values[mask] = new - return new_values + np.putmask(values, mask, new) + return values def putmask_without_repeat( From c3cefa714619b610e456cad447c91144f7b6cfc7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 9 Nov 2021 17:47:02 -0800 Subject: [PATCH 09/53] TST: enable 2D tests for Categorical (#44206) --- pandas/core/arrays/categorical.py | 57 ++++++++++------------ pandas/tests/extension/test_categorical.py | 11 +++++ 2 files changed, 38 insertions(+), 30 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 609543a261a1c..145ff60a28f46 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -6,7 +6,6 @@ from shutil import get_terminal_size from typing import ( TYPE_CHECKING, - Any, Hashable, Sequence, TypeVar, @@ -38,10 +37,6 @@ Dtype, NpDtype, 
Ordered, - PositionalIndexer2D, - PositionalIndexerTuple, - ScalarIndexer, - SequenceIndexer, Shape, npt, type_t, @@ -102,7 +97,10 @@ take_nd, unique1d, ) -from pandas.core.arrays._mixins import NDArrayBackedExtensionArray +from pandas.core.arrays._mixins import ( + NDArrayBackedExtensionArray, + ravel_compat, +) from pandas.core.base import ( ExtensionArray, NoNewAttributesMixin, @@ -113,7 +111,6 @@ extract_array, sanitize_array, ) -from pandas.core.indexers import deprecate_ndim_indexing from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.sorting import nargsort from pandas.core.strings.object_array import ObjectStringArrayMixin @@ -1479,6 +1476,7 @@ def _validate_scalar(self, fill_value): # ------------------------------------------------------------- + @ravel_compat def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: """ The numpy array interface. @@ -1929,7 +1927,10 @@ def __iter__(self): """ Returns an Iterator over the values of this Categorical. """ - return iter(self._internal_get_values().tolist()) + if self.ndim == 1: + return iter(self._internal_get_values().tolist()) + else: + return (self[n] for n in range(len(self))) def __contains__(self, key) -> bool: """ @@ -2048,27 +2049,6 @@ def __repr__(self) -> str: # ------------------------------------------------------------------ - @overload - def __getitem__(self, key: ScalarIndexer) -> Any: - ... - - @overload - def __getitem__( - self: CategoricalT, - key: SequenceIndexer | PositionalIndexerTuple, - ) -> CategoricalT: - ... - - def __getitem__(self: CategoricalT, key: PositionalIndexer2D) -> CategoricalT | Any: - """ - Return an item. - """ - result = super().__getitem__(key) - if getattr(result, "ndim", 0) > 1: - result = result._ndarray - deprecate_ndim_indexing(result) - return result - def _validate_listlike(self, value): # NB: here we assume scalar-like tuples have already been excluded value = extract_array(value, extract_numpy=True) @@ -2306,7 +2286,19 @@ def _concat_same_type( ) -> CategoricalT: from pandas.core.dtypes.concat import union_categoricals - return union_categoricals(to_concat) + result = union_categoricals(to_concat) + + # in case we are concatenating along axis != 0, we need to reshape + # the result from union_categoricals + first = to_concat[0] + if axis >= first.ndim: + raise ValueError + if axis == 1: + if not all(len(x) == len(first) for x in to_concat): + raise ValueError + # TODO: Will this get contiguity wrong? + result = result.reshape(-1, len(to_concat), order="F") + return result # ------------------------------------------------------------------ @@ -2694,6 +2686,11 @@ def _get_codes_for_values(values, categories: Index) -> np.ndarray: """ dtype_equal = is_dtype_equal(values.dtype, categories.dtype) + if values.ndim > 1: + flat = values.ravel() + codes = _get_codes_for_values(flat, categories) + return codes.reshape(values.shape) + if isinstance(categories.dtype, ExtensionDtype) and is_object_dtype(values): # Support inferring the correct extension dtype from an array of # scalar objects. e.g. 
diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py
index e9dc63e9bd903..6a1a9512bc036 100644
--- a/pandas/tests/extension/test_categorical.py
+++ b/pandas/tests/extension/test_categorical.py
@@ -303,3 +303,14 @@ def test_not_equal_with_na(self, categories):
 
 class TestParsing(base.BaseParsingTests):
     pass
+
+
+class Test2DCompat(base.Dim2CompatTests):
+    def test_repr_2d(self, data):
+        # Categorical __repr__ doesn't include "Categorical", so we need
+        # to special-case
+        res = repr(data.reshape(1, -1))
+        assert res.count("\nCategories") == 1
+
+        res = repr(data.reshape(-1, 1))
+        assert res.count("\nCategories") == 1

From b0c1671d73c5482cf0c6176eb22d4f6705e8a069 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Wed, 10 Nov 2021 20:25:10 +0100
Subject: [PATCH 10/53] TST/COMPAT: update csv test to infer time with
 pyarrow>=6.0 (#44381)

---
 pandas/compat/pyarrow.py                   | 2 ++
 pandas/tests/io/parser/test_parse_dates.py | 6 ++++++
 2 files changed, 8 insertions(+)

diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py
index 9bf7139769baa..f9b9409317774 100644
--- a/pandas/compat/pyarrow.py
+++ b/pandas/compat/pyarrow.py
@@ -12,9 +12,11 @@
     pa_version_under3p0 = _palv < Version("3.0.0")
     pa_version_under4p0 = _palv < Version("4.0.0")
     pa_version_under5p0 = _palv < Version("5.0.0")
+    pa_version_under6p0 = _palv < Version("6.0.0")
 except ImportError:
     pa_version_under1p0 = True
     pa_version_under2p0 = True
     pa_version_under3p0 = True
     pa_version_under4p0 = True
     pa_version_under5p0 = True
+    pa_version_under6p0 = True
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index 17c107814995c..c8bea9592e82a 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -26,6 +26,7 @@
     is_platform_windows,
     np_array_datetime64_compat,
 )
+from pandas.compat.pyarrow import pa_version_under6p0
 
 import pandas as pd
 from pandas import (
@@ -431,6 +432,11 @@ def test_date_col_as_index_col(all_parsers):
         columns=["X0", "X2", "X3", "X4", "X5", "X6", "X7"],
         index=index,
     )
+    if parser.engine == "pyarrow" and not pa_version_under6p0:
+        # https://github.com/pandas-dev/pandas/issues/44231
+        # pyarrow 6.0 starts to infer time type
+        expected["X2"] = pd.to_datetime("1970-01-01" + expected["X2"]).dt.time
+
     tm.assert_frame_equal(result, expected)

From 5694f7899b39794f5078b23247d3b783bd9b65e7 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com>
Date: Wed, 10 Nov 2021 19:17:09 -0500
Subject: [PATCH 11/53] TST: Make tests for groupby median/mean more strict on
 dtype (#44374)

---
 pandas/tests/groupby/test_function.py        | 3 +--
 pandas/tests/resample/test_datetime_index.py | 2 --
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 3c402480ea2ec..e5870a206f419 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -394,8 +394,7 @@ def test_median_empty_bins(observed):
 
     result = df.groupby(bins, observed=observed).median()
     expected = df.groupby(bins, observed=observed).agg(lambda x: x.median())
-    # TODO: GH 41137
-    tm.assert_frame_equal(result, expected, check_dtype=False)
+    tm.assert_frame_equal(result, expected)
 
 
 @pytest.mark.parametrize(
diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py
index 8436c2db445ee..34e8e2ac3e84a 100644
--- a/pandas/tests/resample/test_datetime_index.py
+++ b/pandas/tests/resample/test_datetime_index.py
@@ -1692,8 +1692,6 @@ def f(data, add_arg):
     df = DataFrame({"A": 1, "B": 2}, index=date_range("2017", periods=10))
     result = df.groupby("A").resample("D").agg(f, multiplier).astype(float)
     expected = df.groupby("A").resample("D").mean().multiply(multiplier)
-    # TODO: GH 41137
-    expected = expected.astype("float64")
     tm.assert_frame_equal(result, expected)

From fded332ee0b8f1ab7c65e934a177edb1c0695c49 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Wed, 10 Nov 2021 16:39:48 -0800
Subject: [PATCH 12/53] TST: make get_upcast_box more flexible (#44385)

---
 pandas/_testing/__init__.py                 |  2 +-
 pandas/tests/arithmetic/common.py           | 31 +++++++++++----------
 pandas/tests/arithmetic/test_datetime64.py  | 21 ++++++--------
 pandas/tests/arithmetic/test_period.py      | 19 +++++++------
 pandas/tests/arithmetic/test_timedelta64.py | 16 +++++------
 5 files changed, 44 insertions(+), 45 deletions(-)

diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
index e8283a222d86a..c2c55a4060f7a 100644
--- a/pandas/_testing/__init__.py
+++ b/pandas/_testing/__init__.py
@@ -259,7 +259,7 @@ def box_expected(expected, box_cls, transpose=True):
         expected = DatetimeArray(expected)
     elif box_cls is TimedeltaArray:
         expected = TimedeltaArray(expected)
-    elif box_cls is np.ndarray:
+    elif box_cls is np.ndarray or box_cls is np.array:
         expected = np.array(expected)
     elif box_cls is to_array:
         expected = to_array(expected)
diff --git a/pandas/tests/arithmetic/common.py b/pandas/tests/arithmetic/common.py
index 6f4e35ad4dfb2..af70cdfe538bb 100644
--- a/pandas/tests/arithmetic/common.py
+++ b/pandas/tests/arithmetic/common.py
@@ -34,26 +34,29 @@ def assert_invalid_addsub_type(left, right, msg=None):
         right - left
 
 
-def get_expected_box(box):
+def get_upcast_box(left, right, is_cmp: bool = False):
     """
-    Get the box to use for 'expected' in a comparison operation.
-    """
-    if box in [Index, array]:
-        return np.ndarray
-    return box
-
+    Get the box to use for 'expected' in an arithmetic or comparison operation.
 
-def get_upcast_box(box, vector):
-    """
-    Given two box-types, find the one that takes priority.
+    Parameters
+    ----------
+    left : Any
+    right : Any
+    is_cmp : bool, default False
+        Whether the operation is a comparison method.
""" - if box is DataFrame or isinstance(vector, DataFrame): + + if isinstance(left, DataFrame) or isinstance(right, DataFrame): return DataFrame - if box is Series or isinstance(vector, Series): + if isinstance(left, Series) or isinstance(right, Series): + if is_cmp and isinstance(left, Index): + # Index does not defer for comparisons + return np.array return Series - if box is Index or isinstance(vector, Index): + if isinstance(left, Index) or isinstance(right, Index): + if is_cmp: + return np.array return Index - return box + return tm.to_array def assert_invalid_comparison(left, right, box): diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 82f1e60f0aea5..44a70d3933b66 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -43,7 +43,6 @@ from pandas.tests.arithmetic.common import ( assert_invalid_addsub_type, assert_invalid_comparison, - get_expected_box, get_upcast_box, ) @@ -60,12 +59,12 @@ def test_compare_zerodim(self, tz_naive_fixture, box_with_array): # Test comparison with zero-dimensional array is unboxed tz = tz_naive_fixture box = box_with_array - xbox = get_expected_box(box) dti = date_range("20130101", periods=3, tz=tz) other = np.array(dti.to_numpy()[0]) dtarr = tm.box_expected(dti, box) + xbox = get_upcast_box(dtarr, other, True) result = dtarr <= other expected = np.array([True, False, False]) expected = tm.box_expected(expected, xbox) @@ -147,12 +146,12 @@ def test_dt64arr_nat_comparison(self, tz_naive_fixture, box_with_array): # GH#22242, GH#22163 DataFrame considered NaT == ts incorrectly tz = tz_naive_fixture box = box_with_array - xbox = get_expected_box(box) ts = Timestamp.now(tz) ser = Series([ts, NaT]) obj = tm.box_expected(ser, box) + xbox = get_upcast_box(obj, ts, True) expected = Series([True, False], dtype=np.bool_) expected = tm.box_expected(expected, xbox) @@ -244,10 +243,9 @@ def test_nat_comparisons_scalar(self, dtype, data, box_with_array): # on older numpys (since they check object identity) return - xbox = get_expected_box(box) - left = Series(data, dtype=dtype) left = tm.box_expected(left, box) + xbox = get_upcast_box(left, NaT, True) expected = [False, False, False] expected = tm.box_expected(expected, xbox) @@ -323,10 +321,10 @@ def test_timestamp_compare_series(self, left, right): def test_dt64arr_timestamp_equality(self, box_with_array): # GH#11034 - xbox = get_expected_box(box_with_array) ser = Series([Timestamp("2000-01-29 01:59:00"), Timestamp("2000-01-30"), NaT]) ser = tm.box_expected(ser, box_with_array) + xbox = get_upcast_box(ser, ser, True) result = ser != ser expected = tm.box_expected([False, False, True], xbox) @@ -417,13 +415,12 @@ def test_dti_cmp_nat(self, dtype, box_with_array): # on older numpys (since they check object identity) return - xbox = get_expected_box(box_with_array) - left = DatetimeIndex([Timestamp("2011-01-01"), NaT, Timestamp("2011-01-03")]) right = DatetimeIndex([NaT, NaT, Timestamp("2011-01-03")]) left = tm.box_expected(left, box_with_array) right = tm.box_expected(right, box_with_array) + xbox = get_upcast_box(left, right, True) lhs, rhs = left, right if dtype is object: @@ -642,12 +639,11 @@ def test_scalar_comparison_tzawareness( self, comparison_op, other, tz_aware_fixture, box_with_array ): op = comparison_op - box = box_with_array tz = tz_aware_fixture dti = date_range("2016-01-01", periods=2, tz=tz) - xbox = get_expected_box(box) dtarr = tm.box_expected(dti, box_with_array) + xbox = 
get_upcast_box(dtarr, other, True) if op in [operator.eq, operator.ne]: exbool = op is operator.ne expected = np.array([exbool, exbool], dtype=bool) @@ -2421,14 +2417,13 @@ def test_dti_addsub_offset_arraylike( self, tz_naive_fixture, names, op, index_or_series ): # GH#18849, GH#19744 - box = pd.Index other_box = index_or_series tz = tz_naive_fixture dti = date_range("2017-01-01", periods=2, tz=tz, name=names[0]) other = other_box([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], name=names[1]) - xbox = get_upcast_box(box, other) + xbox = get_upcast_box(dti, other) with tm.assert_produces_warning(PerformanceWarning): res = op(dti, other) @@ -2448,7 +2443,7 @@ def test_dti_addsub_object_arraylike( dti = date_range("2017-01-01", periods=2, tz=tz) dtarr = tm.box_expected(dti, box_with_array) other = other_box([pd.offsets.MonthEnd(), Timedelta(days=4)]) - xbox = get_upcast_box(box_with_array, other) + xbox = get_upcast_box(dtarr, other) expected = DatetimeIndex(["2017-01-31", "2017-01-06"], tz=tz_naive_fixture) expected = tm.box_expected(expected, xbox) diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 41c2cb2cc4f1e..f8814a33292ec 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -27,7 +27,7 @@ from pandas.core.arrays import TimedeltaArray from pandas.tests.arithmetic.common import ( assert_invalid_comparison, - get_expected_box, + get_upcast_box, ) # ------------------------------------------------------------------ @@ -41,12 +41,13 @@ class TestPeriodArrayLikeComparisons: def test_compare_zerodim(self, box_with_array): # GH#26689 make sure we unbox zero-dimensional arrays - xbox = get_expected_box(box_with_array) pi = period_range("2000", periods=4) other = np.array(pi.to_numpy()[0]) pi = tm.box_expected(pi, box_with_array) + xbox = get_upcast_box(pi, other, True) + result = pi <= other expected = np.array([True, False, False, False]) expected = tm.box_expected(expected, xbox) @@ -78,11 +79,11 @@ def test_compare_invalid_listlike(self, box_with_array, other): @pytest.mark.parametrize("other_box", [list, np.array, lambda x: x.astype(object)]) def test_compare_object_dtype(self, box_with_array, other_box): - xbox = get_expected_box(box_with_array) pi = period_range("2000", periods=5) parr = tm.box_expected(pi, box_with_array) other = other_box(pi) + xbox = get_upcast_box(parr, other, True) expected = np.array([True, True, True, True, True]) expected = tm.box_expected(expected, xbox) @@ -195,14 +196,15 @@ def test_pi_cmp_period(self): # TODO: moved from test_datetime64; de-duplicate with version below def test_parr_cmp_period_scalar2(self, box_with_array): - xbox = get_expected_box(box_with_array) - pi = period_range("2000-01-01", periods=10, freq="D") val = Period("2000-01-04", freq="D") + expected = [x > val for x in pi] ser = tm.box_expected(pi, box_with_array) + xbox = get_upcast_box(ser, val, True) + expected = tm.box_expected(expected, xbox) result = ser > val tm.assert_equal(result, expected) @@ -216,11 +218,10 @@ def test_parr_cmp_period_scalar2(self, box_with_array): @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) def test_parr_cmp_period_scalar(self, freq, box_with_array): # GH#13200 - xbox = get_expected_box(box_with_array) - base = PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq=freq) base = tm.box_expected(base, box_with_array) per = Period("2011-02", freq=freq) + xbox = get_upcast_box(base, per, True) exp = np.array([False, True, False, False]) exp = 
tm.box_expected(exp, xbox) @@ -255,14 +256,14 @@ def test_parr_cmp_period_scalar(self, freq, box_with_array): @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) def test_parr_cmp_pi(self, freq, box_with_array): # GH#13200 - xbox = get_expected_box(box_with_array) - base = PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq=freq) base = tm.box_expected(base, box_with_array) # TODO: could also box idx? idx = PeriodIndex(["2011-02", "2011-01", "2011-03", "2011-05"], freq=freq) + xbox = get_upcast_box(base, idx, True) + exp = np.array([False, False, True, False]) exp = tm.box_expected(exp, xbox) tm.assert_equal(base == idx, exp) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index b8fa6c79b1b93..86980ad42766e 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -1542,13 +1542,13 @@ def test_tdi_mul_float_series(self, box_with_array): ) def test_tdi_rmul_arraylike(self, other, box_with_array): box = box_with_array - xbox = get_upcast_box(box, other) tdi = TimedeltaIndex(["1 Day"] * 10) - expected = timedelta_range("1 days", "10 days") - expected._data.freq = None + expected = timedelta_range("1 days", "10 days")._with_freq(None) tdi = tm.box_expected(tdi, box) + xbox = get_upcast_box(tdi, other) + expected = tm.box_expected(expected, xbox) result = other * tdi @@ -2000,7 +2000,6 @@ def test_td64arr_rmul_numeric_array( ): # GH#4521 # divide/multiply by integers - xbox = get_upcast_box(box_with_array, vector) tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") vector = vector.astype(any_real_numpy_dtype) @@ -2008,6 +2007,8 @@ def test_td64arr_rmul_numeric_array( expected = Series(["1180 Days", "1770 Days", "NaT"], dtype="timedelta64[ns]") tdser = tm.box_expected(tdser, box_with_array) + xbox = get_upcast_box(tdser, vector) + expected = tm.box_expected(expected, xbox) result = tdser * vector @@ -2026,7 +2027,6 @@ def test_td64arr_div_numeric_array( ): # GH#4521 # divide/multiply by integers - xbox = get_upcast_box(box_with_array, vector) tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") vector = vector.astype(any_real_numpy_dtype) @@ -2034,6 +2034,7 @@ def test_td64arr_div_numeric_array( expected = Series(["2.95D", "1D 23H 12m", "NaT"], dtype="timedelta64[ns]") tdser = tm.box_expected(tdser, box_with_array) + xbox = get_upcast_box(tdser, vector) expected = tm.box_expected(expected, xbox) result = tdser / vector @@ -2085,7 +2086,7 @@ def test_td64arr_mul_int_series(self, box_with_array, names): ) tdi = tm.box_expected(tdi, box) - xbox = get_upcast_box(box, ser) + xbox = get_upcast_box(tdi, ser) expected = tm.box_expected(expected, xbox) @@ -2117,9 +2118,8 @@ def test_float_series_rdiv_td64arr(self, box_with_array, names): name=xname, ) - xbox = get_upcast_box(box, ser) - tdi = tm.box_expected(tdi, box) + xbox = get_upcast_box(tdi, ser) expected = tm.box_expected(expected, xbox) result = ser.__rtruediv__(tdi) From 58081354fddd5b7c051a6fbcb0c11bec26b16e80 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 10 Nov 2021 18:48:10 -0800 Subject: [PATCH 13/53] CI: Use conda-forge to create Python 3.10 env (#44388) --- .github/workflows/sdist.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/sdist.yml b/.github/workflows/sdist.yml index 7692dc522522f..92a9f2a5fb97c 100644 --- a/.github/workflows/sdist.yml +++ b/.github/workflows/sdist.yml @@ -53,6 +53,7 @@ jobs: - uses: 
conda-incubator/setup-miniconda@v2 with: activate-environment: pandas-sdist + channels: conda-forge python-version: '${{ matrix.python-version }}' - name: Install pandas from sdist From 5a9d5848783921a06b4cf44636ffe361d62c31fd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 11 Nov 2021 05:40:52 -0800 Subject: [PATCH 14/53] CLN: misplaced indexing tests (#44375) --- pandas/tests/arithmetic/test_datetime64.py | 33 +++++++++++++++++ .../indexes/datetimes/test_partial_slicing.py | 35 ------------------- pandas/tests/indexes/period/test_indexing.py | 7 +--- pandas/tests/indexes/period/test_period.py | 6 ++-- pandas/tests/indexes/timedeltas/test_ops.py | 11 ------ pandas/tests/indexing/test_loc.py | 6 ++++ pandas/tests/series/indexing/test_indexing.py | 14 -------- pandas/tests/series/test_api.py | 9 +++++ pandas/tests/series/test_constructors.py | 15 ++++++++ 9 files changed, 67 insertions(+), 69 deletions(-) diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 44a70d3933b66..bff461dbc7038 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -359,6 +359,39 @@ def test_dt64arr_timestamp_equality(self, box_with_array): expected = tm.box_expected([False, False, False], xbox) tm.assert_equal(result, expected) + @pytest.mark.parametrize( + "datetimelike", + [ + Timestamp("20130101"), + datetime(2013, 1, 1), + np.datetime64("2013-01-01T00:00", "ns"), + ], + ) + @pytest.mark.parametrize( + "op,expected", + [ + (operator.lt, [True, False, False, False]), + (operator.le, [True, True, False, False]), + (operator.eq, [False, True, False, False]), + (operator.gt, [False, False, False, True]), + ], + ) + def test_dt64_compare_datetime_scalar(self, datetimelike, op, expected): + # GH#17965, test for ability to compare datetime64[ns] columns + # to datetimelike + ser = Series( + [ + Timestamp("20120101"), + Timestamp("20130101"), + np.nan, + Timestamp("20130103"), + ], + name="A", + ) + result = op(ser, datetimelike) + expected = Series(expected, name="A") + tm.assert_series_equal(result, expected) + class TestDatetimeIndexComparisons: diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 896c43db5e356..2f32f9e18311d 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -1,7 +1,6 @@ """ test partial slicing on Series/Frame """ from datetime import datetime -import operator import numpy as np import pytest @@ -412,40 +411,6 @@ def test_loc_datetime_length_one(self): result = df.loc["2016-10-01T00:00:00":] tm.assert_frame_equal(result, df) - @pytest.mark.parametrize( - "datetimelike", - [ - Timestamp("20130101"), - datetime(2013, 1, 1), - np.datetime64("2013-01-01T00:00", "ns"), - ], - ) - @pytest.mark.parametrize( - "op,expected", - [ - (operator.lt, [True, False, False, False]), - (operator.le, [True, True, False, False]), - (operator.eq, [False, True, False, False]), - (operator.gt, [False, False, False, True]), - ], - ) - def test_selection_by_datetimelike(self, datetimelike, op, expected): - # GH issue #17965, test for ability to compare datetime64[ns] columns - # to datetimelike - df = DataFrame( - { - "A": [ - Timestamp("20120101"), - Timestamp("20130101"), - np.nan, - Timestamp("20130103"), - ] - } - ) - result = op(df.A, datetimelike) - expected = Series(expected, name="A") - tm.assert_series_equal(result, expected) - 
@pytest.mark.parametrize( "start", [ diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index dfa750bf933a0..1b5e64bca03a0 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -205,6 +205,7 @@ def test_getitem_seconds(self): # GH7116 # these show deprecations as we are trying # to slice with non-integer indexers + # FIXME: don't leave commented-out # with pytest.raises(IndexError): # idx[v] continue @@ -814,12 +815,6 @@ def test_get_value(self): result2 = idx2.get_value(input2, p1) tm.assert_series_equal(result2, expected2) - def test_loc_str(self): - # https://github.com/pandas-dev/pandas/issues/33964 - index = period_range(start="2000", periods=20, freq="B") - series = Series(range(20), index=index) - assert series.loc["2000-01-14"] == 9 - @pytest.mark.parametrize("freq", ["H", "D"]) def test_get_value_datetime_hourly(self, freq): # get_loc and get_value should treat datetime objects symmetrically diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index e6c31d22e626f..a7dad4e7f352c 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -211,7 +211,7 @@ def _check_all_fields(self, periodindex): ] periods = list(periodindex) - s = Series(periodindex) + ser = Series(periodindex) for field in fields: field_idx = getattr(periodindex, field) @@ -219,10 +219,10 @@ def _check_all_fields(self, periodindex): for x, val in zip(periods, field_idx): assert getattr(x, field) == val - if len(s) == 0: + if len(ser) == 0: continue - field_s = getattr(s.dt, field) + field_s = getattr(ser.dt, field) assert len(periodindex) == len(field_s) for x, val in zip(periods, field_s): assert getattr(x, field) == val diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index 2a5051b2982bb..f5d601bcfbcd1 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -1,8 +1,6 @@ -import numpy as np import pytest from pandas import ( - Series, TimedeltaIndex, timedelta_range, ) @@ -30,15 +28,6 @@ def test_nonunique_contains(self): ): assert idx[0] in idx - def test_unknown_attribute(self): - # see gh-9680 - tdi = timedelta_range(start=0, periods=10, freq="1s") - ts = Series(np.random.normal(size=10), index=tdi) - assert "foo" not in ts.__dict__.keys() - msg = "'Series' object has no attribute 'foo'" - with pytest.raises(AttributeError, match=msg): - ts.foo - def test_infer_freq(self, freq_sample): # GH#11018 idx = timedelta_range("1", freq=freq_sample, periods=10) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index b0aa05371271b..ed9b5cc0850b9 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -2941,3 +2941,9 @@ def test_loc_set_multiple_items_in_multiple_new_columns(self): ) tm.assert_frame_equal(df, expected) + + def test_getitem_loc_str_periodindex(self): + # GH#33964 + index = pd.period_range(start="2000", periods=20, freq="B") + series = Series(range(20), index=index) + assert series.loc["2000-01-14"] == 9 diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index d77f831bee8bc..6c3587c7eeada 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -377,17 +377,3 @@ def test_frozenset_index(): assert s[idx1] == 2 
s[idx1] = 3 assert s[idx1] == 3 - - -def test_boolean_index(): - # GH18579 - s1 = Series([1, 2, 3], index=[4, 5, 6]) - s2 = Series([1, 3, 2], index=s1 == 2) - tm.assert_series_equal(Series([1, 3, 2], [False, True, False]), s2) - - -def test_index_ndim_gt_1_raises(): - # GH18579 - df = DataFrame([[1, 2], [3, 4], [5, 6]], index=[3, 6, 9]) - with pytest.raises(ValueError, match="Index data must be 1-dimensional"): - Series([1, 3, 2], index=df) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index b49c209a59a06..aaf98e46f2f09 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -182,3 +182,12 @@ def test_inspect_getmembers(self): ser = Series(dtype=object) with tm.assert_produces_warning(None): inspect.getmembers(ser) + + def test_unknown_attribute(self): + # GH#9680 + tdi = pd.timedelta_range(start=0, periods=10, freq="1s") + ser = Series(np.random.normal(size=10), index=tdi) + assert "foo" not in ser.__dict__.keys() + msg = "'Series' object has no attribute 'foo'" + with pytest.raises(AttributeError, match=msg): + ser.foo diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 2c33284df18c5..1b488b4cf0b77 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -154,6 +154,12 @@ def test_constructor(self, datetime_series): with pytest.raises(NotImplementedError, match=msg): Series(m) + def test_constructor_index_ndim_gt_1_raises(self): + # GH#18579 + df = DataFrame([[1, 2], [3, 4], [5, 6]], index=[3, 6, 9]) + with pytest.raises(ValueError, match="Index data must be 1-dimensional"): + Series([1, 3, 2], index=df) + @pytest.mark.parametrize("input_class", [list, dict, OrderedDict]) def test_constructor_empty(self, input_class): with tm.assert_produces_warning(FutureWarning): @@ -276,6 +282,15 @@ def test_constructor_list_like(self): result = Series(obj, index=[0, 1, 2]) tm.assert_series_equal(result, expected) + def test_constructor_boolean_index(self): + # GH#18579 + s1 = Series([1, 2, 3], index=[4, 5, 6]) + + index = s1 == 2 + result = Series([1, 3, 2], index=index) + expected = Series([1, 3, 2], index=[False, True, False]) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("dtype", ["bool", "int32", "int64", "float64"]) def test_constructor_index_dtype(self, dtype): # GH 17088 From ae4a888eac41cd33fd59f5712cbd73c4c958b9d7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 11 Nov 2021 05:41:51 -0800 Subject: [PATCH 15/53] BUG: PeriodIndex[B].to_timestamp inferring "D" instead of "B". 
(#44105) --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/arrays/period.py | 18 +++++++++++++++++- pandas/tests/arrays/test_datetimelike.py | 19 +++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 99a66c7e5454b..8732e1c397ce5 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -579,6 +579,7 @@ I/O Period ^^^^^^ - Bug in adding a :class:`Period` object to a ``np.timedelta64`` object incorrectly raising ``TypeError`` (:issue:`44182`) +- Bug in :meth:`PeriodIndex.to_timestamp` when the index has ``freq="B"`` inferring ``freq="D"`` for its result instead of ``freq="B"`` (:issue:`44105`) - Plotting diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 2f36b72229225..01018c7263f32 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -12,6 +12,7 @@ import numpy as np +from pandas._libs import algos as libalgos from pandas._libs.arrays import NDArrayBacked from pandas._libs.tslibs import ( BaseOffset, @@ -506,7 +507,22 @@ def to_timestamp(self, freq=None, how: str = "start") -> DatetimeArray: new_parr = self.asfreq(freq, how=how) new_data = libperiod.periodarr_to_dt64arr(new_parr.asi8, base) - return DatetimeArray(new_data)._with_freq("infer") + dta = DatetimeArray(new_data) + + if self.freq.name == "B": + # See if we can retain BDay instead of Day in cases where + # len(self) is too small for infer_freq to distinguish between them + diffs = libalgos.unique_deltas(self.asi8) + if len(diffs) == 1: + diff = diffs[0] + if diff == self.freq.n: + dta._freq = self.freq + elif diff == 1: + dta._freq = self.freq.base + # TODO: other cases? + return dta + else: + return dta._with_freq("infer") # -------------------------------------------------------------------- diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index c7c1ce6c04692..13fe3c2d427c5 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -1114,6 +1114,25 @@ def test_to_timestamp(self, how, arr1d): # an EA-specific tm.assert_ function tm.assert_index_equal(pd.Index(result), pd.Index(expected)) + def test_to_timestamp_roundtrip_bday(self): + # Case where infer_freq inside would choose "D" instead of "B" + dta = pd.date_range("2021-10-18", periods=3, freq="B")._data + parr = dta.to_period() + result = parr.to_timestamp() + assert result.freq == "B" + tm.assert_extension_array_equal(result, dta) + + dta2 = dta[::2] + parr2 = dta2.to_period() + result2 = parr2.to_timestamp() + assert result2.freq == "2B" + tm.assert_extension_array_equal(result2, dta2) + + parr3 = dta.to_period("2B") + result3 = parr3.to_timestamp() + assert result3.freq == "B" + tm.assert_extension_array_equal(result3, dta) + def test_to_timestamp_out_of_bounds(self): # GH#19643 previously overflowed silently pi = pd.period_range("1500", freq="Y", periods=3) From c4316b5a61e1baadd76811b112ac9043980c378a Mon Sep 17 00:00:00 2001 From: LunarLanding <4441338+LunarLanding@users.noreply.github.com> Date: Thu, 11 Nov 2021 14:15:28 +0000 Subject: [PATCH 16/53] Fix header options (#44391) --- pandas/io/parsers/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 49c2b28207ed5..6d3cc84a31d05 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -86,7 +86,7 @@ delimiters are prone to ignoring 
quoted data. Regex example: ``'\r\t'``. delimiter : str, default ``None`` Alias for sep. -header : int, list of int, default 'infer' +header : int, list of int, None, default 'infer' Row number(s) to use as the column names, and the start of the data. Default behavior is to infer the column names: if no names are passed the behavior is identical to ``header=0`` and column From 885a1c49f1e78fb34978f52ce907bbcbe83ba863 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 11 Nov 2021 09:50:22 -0800 Subject: [PATCH 17/53] ENH: implement EA._putmask (#44387) --- pandas/core/arrays/_mixins.py | 2 +- pandas/core/arrays/base.py | 27 +++++++++++++++++++++++++++ pandas/core/arrays/interval.py | 7 ++++--- pandas/core/indexes/base.py | 6 ++---- pandas/core/internals/blocks.py | 8 +++----- 5 files changed, 37 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 8deeb44f65188..674379f6d65f8 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -310,7 +310,7 @@ def _wrap_reduction_result(self, axis: int | None, result): # ------------------------------------------------------------------------ # __array_function__ methods - def putmask(self, mask: np.ndarray, value) -> None: + def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: """ Analogue to np.putmask(self, mask, value) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 70841197761a9..a64aef64ab49f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1409,6 +1409,33 @@ def insert(self: ExtensionArrayT, loc: int, item) -> ExtensionArrayT: return type(self)._concat_same_type([self[:loc], item_arr, self[loc:]]) + def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: + """ + Analogue to np.putmask(self, mask, value) + + Parameters + ---------- + mask : np.ndarray[bool] + value : scalar or listlike + If listlike, must be arraylike with same length as self. + + Returns + ------- + None + + Notes + ----- + Unlike np.putmask, we do not repeat listlike values with mismatched length. + 'value' should either be a scalar or an arraylike with the same length + as self. 
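+
+        Examples
+        --------
+        A rough sketch of the expected behavior (illustrative only; any EA
+        that supports ``__setitem__`` with a boolean mask behaves the same):
+
+        >>> arr = pd.array([1, 2, 3, 4], dtype="Int64")
+        >>> arr._putmask(np.array([True, False, True, False]), 99)
+        >>> arr
+        <IntegerArray>
+        [99, 2, 99, 4]
+        Length: 4, dtype: Int64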
+ """ + if is_list_like(value): + val = value[mask] + else: + val = value + + self[mask] = val + def _where( self: ExtensionArrayT, mask: npt.NDArray[np.bool_], value ) -> ExtensionArrayT: diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index d5718d59bf8b0..01bf5ec0633b5 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -36,6 +36,7 @@ PositionalIndexer, ScalarIndexer, SequenceIndexer, + npt, ) from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender @@ -1482,15 +1483,15 @@ def to_tuples(self, na_tuple=True) -> np.ndarray: # --------------------------------------------------------------------- - def putmask(self, mask: np.ndarray, value) -> None: + def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: value_left, value_right = self._validate_setitem_value(value) if isinstance(self._left, np.ndarray): np.putmask(self._left, mask, value_left) np.putmask(self._right, mask, value_right) else: - self._left.putmask(mask, value_left) - self._right.putmask(mask, value_right) + self._left._putmask(mask, value_left) + self._right._putmask(mask, value_right) def insert(self: IntervalArrayT, loc: int, item: Interval) -> IntervalArrayT: """ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ba7dde7d2a4d8..2514702b036dd 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4444,8 +4444,7 @@ def _join_non_unique( if isinstance(join_array, np.ndarray): np.putmask(join_array, mask, right) else: - # error: "ExtensionArray" has no attribute "putmask" - join_array.putmask(mask, right) # type: ignore[attr-defined] + join_array._putmask(mask, right) join_index = self._wrap_joined_index(join_array, other) @@ -5051,8 +5050,7 @@ def putmask(self, mask, value) -> Index: else: # Note: we use the original value here, not converted, as # _validate_fill_value is not idempotent - # error: "ExtensionArray" has no attribute "putmask" - values.putmask(mask, value) # type: ignore[attr-defined] + values._putmask(mask, value) return self._shallow_copy(values) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 2589015e0f0b1..66a40b962e183 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1415,15 +1415,13 @@ def putmask(self, mask, new) -> list[Block]: new_values = self.values - if isinstance(new, (np.ndarray, ExtensionArray)) and len(new) == len(mask): - new = new[mask] - if mask.ndim == new_values.ndim + 1: # TODO(EA2D): unnecessary with 2D EAs mask = mask.reshape(new_values.shape) try: - new_values[mask] = new + # Caller is responsible for ensuring matching lengths + new_values._putmask(mask, new) except TypeError: if not is_interval_dtype(self.dtype): # Discussion about what we want to support in the general @@ -1704,7 +1702,7 @@ def putmask(self, mask, new) -> list[Block]: return self.coerce_to_target_dtype(new).putmask(mask, new) arr = self.values - arr.T.putmask(mask, new) + arr.T._putmask(mask, new) return [self] def where(self, other, cond) -> list[Block]: From 9db50bffab4359d31b2616f5c29642c43b50c5c6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 11 Nov 2021 09:51:01 -0800 Subject: [PATCH 18/53] collect partial tests (#44372) --- pandas/tests/indexing/test_partial.py | 427 +++++++++++++------------- 1 file changed, 216 insertions(+), 211 deletions(-) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index c487777fc339e..82d55a7bf7189 100644 --- 
a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -22,6 +22,213 @@ import pandas._testing as tm +class TestEmptyFrameSetitemExpansion: + def test_empty_frame_setitem_index_name_retained(self): + # GH#31368 empty frame has non-None index.name -> retained + df = DataFrame({}, index=pd.RangeIndex(0, name="df_index")) + series = Series(1.23, index=pd.RangeIndex(4, name="series_index")) + + df["series"] = series + expected = DataFrame( + {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="df_index") + ) + + tm.assert_frame_equal(df, expected) + + def test_empty_frame_setitem_index_name_inherited(self): + # GH#36527 empty frame has None index.name -> not retained + df = DataFrame() + series = Series(1.23, index=pd.RangeIndex(4, name="series_index")) + df["series"] = series + expected = DataFrame( + {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="series_index") + ) + tm.assert_frame_equal(df, expected) + + def test_loc_setitem_zerolen_series_columns_align(self): + # columns will align + df = DataFrame(columns=["A", "B"]) + df.loc[0] = Series(1, index=range(4)) + expected = DataFrame(columns=["A", "B"], index=[0], dtype=np.float64) + tm.assert_frame_equal(df, expected) + + # columns will align + df = DataFrame(columns=["A", "B"]) + df.loc[0] = Series(1, index=["B"]) + + exp = DataFrame([[np.nan, 1]], columns=["A", "B"], index=[0], dtype="float64") + tm.assert_frame_equal(df, exp) + + def test_loc_setitem_zerolen_list_length_must_match_columns(self): + # list-like must conform + df = DataFrame(columns=["A", "B"]) + + msg = "cannot set a row with mismatched columns" + with pytest.raises(ValueError, match=msg): + df.loc[0] = [1, 2, 3] + + df = DataFrame(columns=["A", "B"]) + df.loc[3] = [6, 7] # length matches len(df.columns) --> OK! 
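+        # a full-length list enlarges the empty frame: a new row labelled 3
+        # is created and the values are stored with int64 dtype (see `exp`)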
+ + exp = DataFrame([[6, 7]], index=[3], columns=["A", "B"], dtype=np.int64) + tm.assert_frame_equal(df, exp) + + def test_partial_set_empty_frame(self): + + # partially set with an empty object + # frame + df = DataFrame() + + msg = "cannot set a frame with no defined columns" + + with pytest.raises(ValueError, match=msg): + df.loc[1] = 1 + + with pytest.raises(ValueError, match=msg): + df.loc[1] = Series([1], index=["foo"]) + + msg = "cannot set a frame with no defined index and a scalar" + with pytest.raises(ValueError, match=msg): + df.loc[:, 1] = 1 + + def test_partial_set_empty_frame2(self): + # these work as they don't really change + # anything but the index + # GH#5632 + expected = DataFrame(columns=["foo"], index=Index([], dtype="object")) + + df = DataFrame(index=Index([], dtype="object")) + df["foo"] = Series([], dtype="object") + + tm.assert_frame_equal(df, expected) + + df = DataFrame() + df["foo"] = Series(df.index) + + tm.assert_frame_equal(df, expected) + + df = DataFrame() + df["foo"] = df.index + + tm.assert_frame_equal(df, expected) + + def test_partial_set_empty_frame3(self): + expected = DataFrame(columns=["foo"], index=Index([], dtype="int64")) + expected["foo"] = expected["foo"].astype("float64") + + df = DataFrame(index=Index([], dtype="int64")) + df["foo"] = [] + + tm.assert_frame_equal(df, expected) + + df = DataFrame(index=Index([], dtype="int64")) + df["foo"] = Series(np.arange(len(df)), dtype="float64") + + tm.assert_frame_equal(df, expected) + + def test_partial_set_empty_frame4(self): + df = DataFrame(index=Index([], dtype="int64")) + df["foo"] = range(len(df)) + + expected = DataFrame(columns=["foo"], index=Index([], dtype="int64")) + # range is int-dtype-like, so we get int64 dtype + expected["foo"] = expected["foo"].astype("int64") + tm.assert_frame_equal(df, expected) + + def test_partial_set_empty_frame5(self): + df = DataFrame() + tm.assert_index_equal(df.columns, Index([], dtype=object)) + df2 = DataFrame() + df2[1] = Series([1], index=["foo"]) + df.loc[:, 1] = Series([1], index=["foo"]) + tm.assert_frame_equal(df, DataFrame([[1]], index=["foo"], columns=[1])) + tm.assert_frame_equal(df, df2) + + def test_partial_set_empty_frame_no_index(self): + # no index to start + expected = DataFrame({0: Series(1, index=range(4))}, columns=["A", "B", 0]) + + df = DataFrame(columns=["A", "B"]) + df[0] = Series(1, index=range(4)) + df.dtypes + str(df) + tm.assert_frame_equal(df, expected) + + df = DataFrame(columns=["A", "B"]) + df.loc[:, 0] = Series(1, index=range(4)) + df.dtypes + str(df) + tm.assert_frame_equal(df, expected) + + def test_partial_set_empty_frame_row(self): + # GH#5720, GH#5744 + # don't create rows when empty + expected = DataFrame(columns=["A", "B", "New"], index=Index([], dtype="int64")) + expected["A"] = expected["A"].astype("int64") + expected["B"] = expected["B"].astype("float64") + expected["New"] = expected["New"].astype("float64") + + df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]}) + y = df[df.A > 5] + y["New"] = np.nan + tm.assert_frame_equal(y, expected) + + expected = DataFrame(columns=["a", "b", "c c", "d"]) + expected["d"] = expected["d"].astype("int64") + df = DataFrame(columns=["a", "b", "c c"]) + df["d"] = 3 + tm.assert_frame_equal(df, expected) + tm.assert_series_equal(df["c c"], Series(name="c c", dtype=object)) + + # reindex columns is ok + df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]}) + y = df[df.A > 5] + result = y.reindex(columns=["A", "B", "C"]) + expected = DataFrame(columns=["A", "B", "C"], index=Index([], 
dtype="int64")) + expected["A"] = expected["A"].astype("int64") + expected["B"] = expected["B"].astype("float64") + expected["C"] = expected["C"].astype("float64") + tm.assert_frame_equal(result, expected) + + def test_partial_set_empty_frame_set_series(self): + # GH#5756 + # setting with empty Series + df = DataFrame(Series(dtype=object)) + expected = DataFrame({0: Series(dtype=object)}) + tm.assert_frame_equal(df, expected) + + df = DataFrame(Series(name="foo", dtype=object)) + expected = DataFrame({"foo": Series(dtype=object)}) + tm.assert_frame_equal(df, expected) + + def test_partial_set_empty_frame_empty_copy_assignment(self): + # GH#5932 + # copy on empty with assignment fails + df = DataFrame(index=[0]) + df = df.copy() + df["a"] = 0 + expected = DataFrame(0, index=[0], columns=["a"]) + tm.assert_frame_equal(df, expected) + + def test_partial_set_empty_frame_empty_consistencies(self): + # GH#6171 + # consistency on empty frames + df = DataFrame(columns=["x", "y"]) + df["x"] = [1, 2] + expected = DataFrame({"x": [1, 2], "y": [np.nan, np.nan]}) + tm.assert_frame_equal(df, expected, check_dtype=False) + + df = DataFrame(columns=["x", "y"]) + df["x"] = ["1", "2"] + expected = DataFrame({"x": ["1", "2"], "y": [np.nan, np.nan]}, dtype=object) + tm.assert_frame_equal(df, expected) + + df = DataFrame(columns=["x", "y"]) + df.loc[0, "x"] = 1 + expected = DataFrame({"x": [1], "y": [np.nan]}) + tm.assert_frame_equal(df, expected, check_dtype=False) + + class TestPartialSetting: def test_partial_setting(self): @@ -61,8 +268,7 @@ def test_partial_setting(self): with pytest.raises(IndexError, match=msg): s.iat[3] = 5.0 - # ## frame ## - + def test_partial_setting_frame(self): df_orig = DataFrame( np.arange(6).reshape(3, 2), columns=["A", "B"], dtype="int64" ) @@ -166,33 +372,6 @@ def test_partial_setting_mixed_dtype(self): df.loc[2] = df.loc[1] tm.assert_frame_equal(df, expected) - # columns will align - df = DataFrame(columns=["A", "B"]) - df.loc[0] = Series(1, index=range(4)) - expected = DataFrame(columns=["A", "B"], index=[0], dtype=np.float64) - tm.assert_frame_equal(df, expected) - - # columns will align - # TODO: it isn't great that this behavior depends on consolidation - df = DataFrame(columns=["A", "B"])._consolidate() - df.loc[0] = Series(1, index=["B"]) - - exp = DataFrame([[np.nan, 1]], columns=["A", "B"], index=[0], dtype="float64") - tm.assert_frame_equal(df, exp) - - # list-like must conform - df = DataFrame(columns=["A", "B"]) - - msg = "cannot set a row with mismatched columns" - with pytest.raises(ValueError, match=msg): - df.loc[0] = [1, 2, 3] - - df = DataFrame(columns=["A", "B"]) - df.loc[3] = [6, 7] - - exp = DataFrame([[6, 7]], index=[3], columns=["A", "B"], dtype=np.int64) - tm.assert_frame_equal(df, exp) - def test_series_partial_set(self): # partial set with new index # Regression from GH4825 @@ -352,6 +531,7 @@ def test_setitem_with_expansion_numeric_into_datetimeindex(self, key): ex_index = Index(list(orig.index) + [key], dtype=object, name=orig.index.name) ex_data = np.concatenate([orig.values, df.iloc[[0]].values], axis=0) expected = DataFrame(ex_data, index=ex_index, columns=orig.columns) + tm.assert_frame_equal(df, expected) def test_partial_set_invalid(self): @@ -369,162 +549,6 @@ def test_partial_set_invalid(self): tm.assert_index_equal(df.index, Index(orig.index.tolist() + ["a"])) assert df.index.dtype == "object" - def test_partial_set_empty_frame(self): - - # partially set with an empty object - # frame - df = DataFrame() - - msg = "cannot set a frame with 
no defined columns" - - with pytest.raises(ValueError, match=msg): - df.loc[1] = 1 - - with pytest.raises(ValueError, match=msg): - df.loc[1] = Series([1], index=["foo"]) - - msg = "cannot set a frame with no defined index and a scalar" - with pytest.raises(ValueError, match=msg): - df.loc[:, 1] = 1 - - def test_partial_set_empty_frame2(self): - # these work as they don't really change - # anything but the index - # GH5632 - expected = DataFrame(columns=["foo"], index=Index([], dtype="object")) - - df = DataFrame(index=Index([], dtype="object")) - df["foo"] = Series([], dtype="object") - - tm.assert_frame_equal(df, expected) - - df = DataFrame() - df["foo"] = Series(df.index) - - tm.assert_frame_equal(df, expected) - - df = DataFrame() - df["foo"] = df.index - - tm.assert_frame_equal(df, expected) - - def test_partial_set_empty_frame3(self): - expected = DataFrame(columns=["foo"], index=Index([], dtype="int64")) - expected["foo"] = expected["foo"].astype("float64") - - df = DataFrame(index=Index([], dtype="int64")) - df["foo"] = [] - - tm.assert_frame_equal(df, expected) - - df = DataFrame(index=Index([], dtype="int64")) - df["foo"] = Series(np.arange(len(df)), dtype="float64") - - tm.assert_frame_equal(df, expected) - - def test_partial_set_empty_frame4(self): - df = DataFrame(index=Index([], dtype="int64")) - df["foo"] = range(len(df)) - - expected = DataFrame(columns=["foo"], index=Index([], dtype="int64")) - # range is int-dtype-like, so we get int64 dtype - expected["foo"] = expected["foo"].astype("int64") - tm.assert_frame_equal(df, expected) - - def test_partial_set_empty_frame5(self): - df = DataFrame() - tm.assert_index_equal(df.columns, Index([], dtype=object)) - df2 = DataFrame() - df2[1] = Series([1], index=["foo"]) - df.loc[:, 1] = Series([1], index=["foo"]) - tm.assert_frame_equal(df, DataFrame([[1]], index=["foo"], columns=[1])) - tm.assert_frame_equal(df, df2) - - def test_partial_set_empty_frame_no_index(self): - # no index to start - expected = DataFrame({0: Series(1, index=range(4))}, columns=["A", "B", 0]) - - df = DataFrame(columns=["A", "B"]) - df[0] = Series(1, index=range(4)) - df.dtypes - str(df) - tm.assert_frame_equal(df, expected) - - df = DataFrame(columns=["A", "B"]) - df.loc[:, 0] = Series(1, index=range(4)) - df.dtypes - str(df) - tm.assert_frame_equal(df, expected) - - def test_partial_set_empty_frame_row(self): - # GH5720, GH5744 - # don't create rows when empty - expected = DataFrame(columns=["A", "B", "New"], index=Index([], dtype="int64")) - expected["A"] = expected["A"].astype("int64") - expected["B"] = expected["B"].astype("float64") - expected["New"] = expected["New"].astype("float64") - - df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]}) - y = df[df.A > 5] - y["New"] = np.nan - tm.assert_frame_equal(y, expected) - # tm.assert_frame_equal(y,expected) - - expected = DataFrame(columns=["a", "b", "c c", "d"]) - expected["d"] = expected["d"].astype("int64") - df = DataFrame(columns=["a", "b", "c c"]) - df["d"] = 3 - tm.assert_frame_equal(df, expected) - tm.assert_series_equal(df["c c"], Series(name="c c", dtype=object)) - - # reindex columns is ok - df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]}) - y = df[df.A > 5] - result = y.reindex(columns=["A", "B", "C"]) - expected = DataFrame(columns=["A", "B", "C"], index=Index([], dtype="int64")) - expected["A"] = expected["A"].astype("int64") - expected["B"] = expected["B"].astype("float64") - expected["C"] = expected["C"].astype("float64") - tm.assert_frame_equal(result, expected) - - def 
test_partial_set_empty_frame_set_series(self): - # GH 5756 - # setting with empty Series - df = DataFrame(Series(dtype=object)) - expected = DataFrame({0: Series(dtype=object)}) - tm.assert_frame_equal(df, expected) - - df = DataFrame(Series(name="foo", dtype=object)) - expected = DataFrame({"foo": Series(dtype=object)}) - tm.assert_frame_equal(df, expected) - - def test_partial_set_empty_frame_empty_copy_assignment(self): - # GH 5932 - # copy on empty with assignment fails - df = DataFrame(index=[0]) - df = df.copy() - df["a"] = 0 - expected = DataFrame(0, index=[0], columns=["a"]) - tm.assert_frame_equal(df, expected) - - def test_partial_set_empty_frame_empty_consistencies(self): - # GH 6171 - # consistency on empty frames - df = DataFrame(columns=["x", "y"]) - df["x"] = [1, 2] - expected = DataFrame({"x": [1, 2], "y": [np.nan, np.nan]}) - tm.assert_frame_equal(df, expected, check_dtype=False) - - df = DataFrame(columns=["x", "y"]) - df["x"] = ["1", "2"] - expected = DataFrame({"x": ["1", "2"], "y": [np.nan, np.nan]}, dtype=object) - tm.assert_frame_equal(df, expected) - - df = DataFrame(columns=["x", "y"]) - df.loc[0, "x"] = 1 - expected = DataFrame({"x": [1], "y": [np.nan]}) - tm.assert_frame_equal(df, expected, check_dtype=False) - @pytest.mark.parametrize( "idx,labels,expected_idx", [ @@ -584,14 +608,14 @@ def test_loc_with_list_of_strings_representing_datetimes_missing_value( self, idx, labels ): # GH 11278 - s = Series(range(20), index=idx) + ser = Series(range(20), index=idx) df = DataFrame(range(20), index=idx) msg = r"not in index" with pytest.raises(KeyError, match=msg): - s.loc[labels] + ser.loc[labels] with pytest.raises(KeyError, match=msg): - s[labels] + ser[labels] with pytest.raises(KeyError, match=msg): df.loc[labels] @@ -628,37 +652,18 @@ def test_loc_with_list_of_strings_representing_datetimes_not_matched_type( self, idx, labels, msg ): # GH 11278 - s = Series(range(20), index=idx) + ser = Series(range(20), index=idx) df = DataFrame(range(20), index=idx) with pytest.raises(KeyError, match=msg): - s.loc[labels] + ser.loc[labels] with pytest.raises(KeyError, match=msg): - s[labels] + ser[labels] with pytest.raises(KeyError, match=msg): df.loc[labels] - def test_index_name_empty(self): - # GH 31368 - df = DataFrame({}, index=pd.RangeIndex(0, name="df_index")) - series = Series(1.23, index=pd.RangeIndex(4, name="series_index")) - - df["series"] = series - expected = DataFrame( - {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="df_index") - ) - - tm.assert_frame_equal(df, expected) - - # GH 36527 - df = DataFrame() - series = Series(1.23, index=pd.RangeIndex(4, name="series_index")) - df["series"] = series - expected = DataFrame( - {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="series_index") - ) - tm.assert_frame_equal(df, expected) +class TestStringSlicing: def test_slice_irregular_datetime_index_with_nan(self): # GH36953 index = pd.to_datetime(["2012-01-01", "2012-01-02", "2012-01-03", None]) From e4ddd2d47e1d9e260d60e249391bd37d646c2b90 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 11 Nov 2021 09:51:20 -0800 Subject: [PATCH 19/53] CLN: split/fixturize to_datetime tests (#44367) --- pandas/tests/tools/test_to_datetime.py | 388 ++++++++++++------------- 1 file changed, 186 insertions(+), 202 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 1f75bc11005bc..4867ba58838ef 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -46,6 +46,14 @@ from 
pandas.core.tools.datetimes import start_caching_at +@pytest.fixture(params=[True, False]) +def cache(request): + """ + cache keyword to pass to to_datetime. + """ + return request.param + + class TestTimeConversionFormats: @pytest.mark.parametrize("readonly", [True, False]) def test_to_datetime_readonly(self, readonly): @@ -57,7 +65,6 @@ def test_to_datetime_readonly(self, readonly): expected = to_datetime([]) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_format(self, cache): values = ["1/1/2000", "1/2/2000", "1/3/2000"] @@ -82,7 +89,6 @@ def test_to_datetime_format(self, cache): else: tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_format_YYYYMMDD(self, cache): s = Series([19801222, 19801222] + [19810105] * 5) expected = Series([Timestamp(x) for x in s.apply(str)]) @@ -109,17 +115,18 @@ def test_to_datetime_format_YYYYMMDD(self, cache): result = to_datetime(s, format="%Y%m%d", cache=cache) tm.assert_series_equal(result, expected) + def test_to_datetime_format_YYYYMMDD_coercion(self, cache): # coercion # GH 7930 - s = Series([20121231, 20141231, 99991231]) - result = to_datetime(s, format="%Y%m%d", errors="ignore", cache=cache) + ser = Series([20121231, 20141231, 99991231]) + result = to_datetime(ser, format="%Y%m%d", errors="ignore", cache=cache) expected = Series( [datetime(2012, 12, 31), datetime(2014, 12, 31), datetime(9999, 12, 31)], dtype=object, ) tm.assert_series_equal(result, expected) - result = to_datetime(s, format="%Y%m%d", errors="coerce", cache=cache) + result = to_datetime(ser, format="%Y%m%d", errors="coerce", cache=cache) expected = Series(["20121231", "20141231", "NaT"], dtype="M8[ns]") tm.assert_series_equal(result, expected) @@ -199,7 +206,6 @@ def test_to_datetime_with_NA(self, data, format, expected): result = to_datetime(data, format=format) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_format_integer(self, cache): # GH 10178 s = Series([2000, 2001, 2002]) @@ -236,7 +242,6 @@ def test_int_to_datetime_format_YYYYMMDD_typeerror(self, int_date, expected): result = to_datetime(int_date, format="%Y%m%d", errors="ignore") assert result == expected - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_format_microsecond(self, cache): # these are locale dependent @@ -249,7 +254,6 @@ def test_to_datetime_format_microsecond(self, cache): exp = datetime.strptime(val, format) assert result == exp - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_format_time(self, cache): data = [ ["01/10/2010 15:20", "%m/%d/%Y %H:%M", Timestamp("2010-01-10 15:20")], @@ -259,6 +263,7 @@ def test_to_datetime_format_time(self, cache): "%m/%d/%Y %H:%M:%S", Timestamp("2010-01-10 13:56:01"), ] # , + # FIXME: don't leave commented-out # ['01/10/2010 08:14 PM', '%m/%d/%Y %I:%M %p', # Timestamp('2010-01-10 20:14')], # ['01/10/2010 07:40 AM', '%m/%d/%Y %I:%M %p', @@ -270,7 +275,6 @@ def test_to_datetime_format_time(self, cache): assert to_datetime(s, format=format, cache=cache) == dt @td.skip_if_has_locale - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_with_non_exact(self, cache): # GH 10834 # 8904 @@ -284,7 +288,6 @@ def test_to_datetime_with_non_exact(self, cache): ) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("cache", [True, False]) def test_parse_nanoseconds_with_formula(self, cache): # GH8989 @@ -300,14 +303,15 @@ def 
test_parse_nanoseconds_with_formula(self, cache): result = to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f", cache=cache) assert result == expected - @pytest.mark.parametrize("cache", [True, False]) - def test_to_datetime_format_weeks(self, cache): - data = [ + @pytest.mark.parametrize( + "value,fmt,expected", + [ ["2009324", "%Y%W%w", Timestamp("2009-08-13")], ["2013020", "%Y%U%w", Timestamp("2013-01-13")], - ] - for s, format, dt in data: - assert to_datetime(s, format=format, cache=cache) == dt + ], + ) + def test_to_datetime_format_weeks(self, value, fmt, expected, cache): + assert to_datetime(value, format=fmt, cache=cache) == expected @pytest.mark.parametrize( "fmt,dates,expected_dates", @@ -601,7 +605,6 @@ def test_to_datetime_today_now_unicode_bytes(self): to_datetime(["now"]) to_datetime(["today"]) - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_dt64s(self, cache): in_bound_dts = [np.datetime64("2000-01-01"), np.datetime64("2000-01-02")] @@ -611,7 +614,6 @@ def test_to_datetime_dt64s(self, cache): @pytest.mark.parametrize( "dt", [np.datetime64("1000-01-01"), np.datetime64("5000-01-02")] ) - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_dt64s_out_of_bounds(self, cache, dt): msg = f"Out of bounds nanosecond timestamp: {dt}" with pytest.raises(OutOfBoundsDatetime, match=msg): @@ -620,7 +622,6 @@ def test_to_datetime_dt64s_out_of_bounds(self, cache, dt): Timestamp(dt) assert to_datetime(dt, errors="coerce", cache=cache) is NaT - @pytest.mark.parametrize("cache", [True, False]) @pytest.mark.parametrize("unit", ["s", "D"]) def test_to_datetime_array_of_dt64s(self, cache, unit): # https://github.com/pandas-dev/pandas/issues/31491 @@ -659,7 +660,6 @@ def test_to_datetime_array_of_dt64s(self, cache, unit): Index([dt.item() for dt in dts_with_oob]), ) - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_tz(self, cache): # xref 8260 @@ -686,7 +686,6 @@ def test_to_datetime_tz(self, cache): with pytest.raises(ValueError, match=msg): to_datetime(arr, cache=cache) - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_different_offsets(self, cache): # inspired by asv timeseries.ToDatetimeNONISO8601 benchmark # see GH-26097 for more @@ -697,7 +696,6 @@ def test_to_datetime_different_offsets(self, cache): result = to_datetime(arr, cache=cache) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_tz_pytz(self, cache): # see gh-8260 us_eastern = pytz.timezone("US/Eastern") @@ -720,19 +718,16 @@ def test_to_datetime_tz_pytz(self, cache): ) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("cache", [True, False]) @pytest.mark.parametrize( - "init_constructor, end_constructor, test_method", + "init_constructor, end_constructor", [ - (Index, DatetimeIndex, tm.assert_index_equal), - (list, DatetimeIndex, tm.assert_index_equal), - (np.array, DatetimeIndex, tm.assert_index_equal), - (Series, Series, tm.assert_series_equal), + (Index, DatetimeIndex), + (list, DatetimeIndex), + (np.array, DatetimeIndex), + (Series, Series), ], ) - def test_to_datetime_utc_true( - self, cache, init_constructor, end_constructor, test_method - ): + def test_to_datetime_utc_true(self, cache, init_constructor, end_constructor): # See gh-11934 & gh-6415 data = ["20100102 121314", "20100102 121315"] expected_data = [ @@ -744,14 +739,13 @@ def test_to_datetime_utc_true( init_constructor(data), format="%Y%m%d %H%M%S", utc=True, cache=cache ) expected = 
end_constructor(expected_data) - test_method(result, expected) + tm.assert_equal(result, expected) # Test scalar case as well for scalar, expected in zip(data, expected_data): result = to_datetime(scalar, format="%Y%m%d %H%M%S", utc=True, cache=cache) assert result == expected - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_utc_true_with_series_single_value(self, cache): # GH 15760 UTC=True with Series ts = 1.5e18 @@ -759,7 +753,6 @@ def test_to_datetime_utc_true_with_series_single_value(self, cache): expected = Series([Timestamp(ts, tz="utc")]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_utc_true_with_series_tzaware_string(self, cache): ts = "2013-01-01 00:00:00-01:00" expected_ts = "2013-01-01 01:00:00" @@ -768,7 +761,6 @@ def test_to_datetime_utc_true_with_series_tzaware_string(self, cache): expected = Series([Timestamp(expected_ts, tz="utc")] * 3) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("cache", [True, False]) @pytest.mark.parametrize( "date, dtype", [ @@ -781,7 +773,6 @@ def test_to_datetime_utc_true_with_series_datetime_ns(self, cache, date, dtype): result = to_datetime(Series([date], dtype=dtype), utc=True, cache=cache) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("cache", [True, False]) @td.skip_if_no("psycopg2") def test_to_datetime_tz_psycopg2(self, cache): @@ -822,7 +813,6 @@ def test_to_datetime_tz_psycopg2(self, cache): expected = DatetimeIndex(["2000-01-01 13:00:00"], dtype="datetime64[ns, UTC]") tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("cache", [True, False]) def test_datetime_bool(self, cache): # GH13176 msg = r"dtype bool cannot be converted to datetime64\[ns\]" @@ -945,18 +935,6 @@ def test_to_datetime_cache(self, utc, format, constructor): tm.assert_index_equal(result, expected) - @pytest.mark.parametrize( - "listlike", - [ - (deque([Timestamp("2010-06-02 09:30:00")] * 51)), - ([Timestamp("2010-06-02 09:30:00")] * 51), - (tuple([Timestamp("2010-06-02 09:30:00")] * 51)), - ], - ) - def test_no_slicing_errors_in_should_cache(self, listlike): - # GH 29403 - assert tools.should_cache(listlike) is True - def test_to_datetime_from_deque(self): # GH 29403 result = to_datetime(deque([Timestamp("2010-06-02 09:30:00")] * 51)) @@ -1198,7 +1176,6 @@ def test_to_datetime_fixed_offset(self): class TestToDatetimeUnit: - @pytest.mark.parametrize("cache", [True, False]) def test_unit(self, cache): # GH 11758 # test proper behavior with errors @@ -1247,17 +1224,19 @@ def test_unit(self, cache): with pytest.raises(OutOfBoundsDatetime, match=msg): to_datetime(values, errors="raise", unit="s", cache=cache) + def test_to_datetime_invalid_str_not_out_of_bounds_valuerror(self, cache): # if we have a string, then we raise a ValueError # and NOT an OutOfBoundsDatetime - for val in ["foo", Timestamp("20130101")]: - try: - to_datetime(val, errors="raise", unit="s", cache=cache) - except OutOfBoundsDatetime as err: - raise AssertionError("incorrect exception raised") from err - except ValueError: - pass - - @pytest.mark.parametrize("cache", [True, False]) + + try: + to_datetime("foo", errors="raise", unit="s", cache=cache) + except OutOfBoundsDatetime as err: + raise AssertionError("incorrect exception raised") from err + except ValueError: + pass + else: + assert False, "Failed to raise ValueError" + def test_unit_consistency(self, cache): # consistency of conversions @@ -1274,7 +1253,6 @@ def test_unit_consistency(self, cache): 
assert result == expected assert isinstance(result, Timestamp) - @pytest.mark.parametrize("cache", [True, False]) def test_unit_with_numeric(self, cache): # GH 13180 @@ -1303,7 +1281,6 @@ def test_unit_with_numeric(self, cache): result = to_datetime(arr, errors="coerce", cache=cache) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("cache", [True, False]) def test_unit_mixed(self, cache): # mixed integers/datetimes @@ -1324,7 +1301,6 @@ def test_unit_mixed(self, cache): with pytest.raises(ValueError, match=msg): to_datetime(arr, errors="raise", cache=cache) - @pytest.mark.parametrize("cache", [True, False]) def test_unit_rounding(self, cache): # GH 14156 & GH 20445: argument will incur floating point errors # but no premature rounding @@ -1332,17 +1308,105 @@ def test_unit_rounding(self, cache): expected = Timestamp("2015-06-19 19:55:31.877000192") assert result == expected - @pytest.mark.parametrize("cache", [True, False]) def test_unit_ignore_keeps_name(self, cache): # GH 21697 expected = Index([15e9] * 2, name="name") result = to_datetime(expected, errors="ignore", unit="s", cache=cache) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("cache", [True, False]) - def test_dataframe(self, cache): + def test_to_datetime_errors_ignore_utc_true(self): + # GH#23758 + result = to_datetime([1], unit="s", utc=True, errors="ignore") + expected = DatetimeIndex(["1970-01-01 00:00:01"], tz="UTC") + tm.assert_index_equal(result, expected) + + # TODO: this is moved from tests.series.test_timeseries, may be redundant + def test_to_datetime_unit(self): + + epoch = 1370745748 + s1 = Series([epoch + t for t in range(20)]) + s2 = Series([epoch + t for t in range(20)]).astype(float) + + for ser in [s1, s2]: + result = to_datetime(ser, unit="s") + expected = Series( + [ + Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) + for t in range(20) + ] + ) + tm.assert_series_equal(result, expected) + + s1 = Series([epoch + t for t in range(20)] + [iNaT]) + s2 = Series([epoch + t for t in range(20)] + [iNaT]).astype(float) + s3 = Series([epoch + t for t in range(20)] + [np.nan]) + + for ser in [s1, s2, s3]: + result = to_datetime(ser, unit="s") + expected = Series( + [ + Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) + for t in range(20) + ] + + [NaT] + ) + tm.assert_series_equal(result, expected) + + def test_to_datetime_unit_fractional_seconds(self): + + # GH13834 + epoch = 1370745748 + s = Series([epoch + t for t in np.arange(0, 2, 0.25)] + [iNaT]).astype(float) + result = to_datetime(s, unit="s") + expected = Series( + [ + Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) + for t in np.arange(0, 2, 0.25) + ] + + [NaT] + ) + # GH20455 argument will incur floating point errors but no premature rounding + result = result.round("ms") + tm.assert_series_equal(result, expected) + + def test_to_datetime_unit_na_values(self): + result = to_datetime([1, 2, "NaT", NaT, np.nan], unit="D") + expected = DatetimeIndex( + [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 3 + ) + tm.assert_index_equal(result, expected) + + def test_to_datetime_unit_invalid(self): + msg = "non convertible value foo with the unit 'D'" + with pytest.raises(ValueError, match=msg): + to_datetime([1, 2, "foo"], unit="D") + msg = "cannot convert input 111111111 with the unit 'D'" + with pytest.raises(OutOfBoundsDatetime, match=msg): + to_datetime([1, 2, 111111111], unit="D") + + def test_to_timestamp_unit_coerce(self): + # coerce we can process + expected = DatetimeIndex( + 
[Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 1 + ) + result = to_datetime([1, 2, "foo"], unit="D", errors="coerce") + tm.assert_index_equal(result, expected) - df = DataFrame( + result = to_datetime([1, 2, 111111111], unit="D", errors="coerce") + tm.assert_index_equal(result, expected) + + +class TestToDatetimeDataFrame: + @pytest.fixture(params=[True, False]) + def cache(self, request): + """ + cache keyword to pass to to_datetime. + """ + return request.param + + @pytest.fixture + def df(self): + return DataFrame( { "year": [2015, 2016], "month": [2, 3], @@ -1356,6 +1420,8 @@ def test_dataframe(self, cache): } ) + def test_dataframe(self, df, cache): + result = to_datetime( {"year": df["year"], "month": df["month"], "day": df["day"]}, cache=cache ) @@ -1377,6 +1443,7 @@ def test_dataframe(self, cache): ) tm.assert_series_equal(result, expected2) + def test_dataframe_field_aliases_column_subset(self, df, cache): # unit mappings units = [ { @@ -1404,6 +1471,7 @@ def test_dataframe(self, cache): ) tm.assert_series_equal(result, expected) + def test_dataframe_field_aliases(self, df, cache): d = { "year": "year", "month": "month", @@ -1425,10 +1493,18 @@ def test_dataframe(self, cache): ) tm.assert_series_equal(result, expected) + def test_dataframe_str_dtype(self, df, cache): # coerce back to int result = to_datetime(df.astype(str), cache=cache) + expected = Series( + [ + Timestamp("20150204 06:58:10.001002003"), + Timestamp("20160305 07:59:11.001002003"), + ] + ) tm.assert_series_equal(result, expected) + def test_dataframe_coerce(self, cache): # passing coerce df2 = DataFrame({"year": [2015, 2016], "month": [2, 20], "day": [4, 5]}) @@ -1438,10 +1514,12 @@ def test_dataframe(self, cache): ) with pytest.raises(ValueError, match=msg): to_datetime(df2, cache=cache) + result = to_datetime(df2, errors="coerce", cache=cache) expected = Series([Timestamp("20150204 00:00:00"), NaT]) tm.assert_series_equal(result, expected) + def test_dataframe_extra_keys_raisesm(self, df, cache): # extra columns msg = r"extra keys have been passed to the datetime assemblage: \[foo\]" with pytest.raises(ValueError, match=msg): @@ -1449,6 +1527,7 @@ def test_dataframe(self, cache): df2["foo"] = 1 to_datetime(df2, cache=cache) + def test_dataframe_missing_keys_raises(self, df, cache): # not enough msg = ( r"to assemble mappings requires at least that \[year, month, " @@ -1464,6 +1543,7 @@ def test_dataframe(self, cache): with pytest.raises(ValueError, match=msg): to_datetime(df[c], cache=cache) + def test_dataframe_duplicate_columns_raises(self, cache): # duplicates msg = "cannot assemble with duplicate keys" df2 = DataFrame({"year": [2015, 2016], "month": [2, 20], "day": [4, 5]}) @@ -1478,9 +1558,8 @@ def test_dataframe(self, cache): with pytest.raises(ValueError, match=msg): to_datetime(df2, cache=cache) - @pytest.mark.parametrize("cache", [True, False]) def test_dataframe_dtypes(self, cache): - # #13451 + # GH#13451 df = DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) # int16 @@ -1506,7 +1585,7 @@ def test_dataframe_dtypes(self, cache): to_datetime(df, cache=cache) def test_dataframe_utc_true(self): - # GH 23760 + # GH#23760 df = DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) result = to_datetime(df, utc=True) expected = Series( @@ -1514,94 +1593,6 @@ def test_dataframe_utc_true(self): ).dt.tz_localize("UTC") tm.assert_series_equal(result, expected) - def test_to_datetime_errors_ignore_utc_true(self): - # GH 23758 - result = to_datetime([1], unit="s", utc=True, 
errors="ignore") - expected = DatetimeIndex(["1970-01-01 00:00:01"], tz="UTC") - tm.assert_index_equal(result, expected) - - # TODO: this is moved from tests.series.test_timeseries, may be redundant - def test_to_datetime_unit(self): - - epoch = 1370745748 - s = Series([epoch + t for t in range(20)]) - result = to_datetime(s, unit="s") - expected = Series( - [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] - ) - tm.assert_series_equal(result, expected) - - s = Series([epoch + t for t in range(20)]).astype(float) - result = to_datetime(s, unit="s") - expected = Series( - [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] - ) - tm.assert_series_equal(result, expected) - - s = Series([epoch + t for t in range(20)] + [iNaT]) - result = to_datetime(s, unit="s") - expected = Series( - [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] - + [NaT] - ) - tm.assert_series_equal(result, expected) - - s = Series([epoch + t for t in range(20)] + [iNaT]).astype(float) - result = to_datetime(s, unit="s") - expected = Series( - [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] - + [NaT] - ) - tm.assert_series_equal(result, expected) - - # GH13834 - s = Series([epoch + t for t in np.arange(0, 2, 0.25)] + [iNaT]).astype(float) - result = to_datetime(s, unit="s") - expected = Series( - [ - Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) - for t in np.arange(0, 2, 0.25) - ] - + [NaT] - ) - # GH20455 argument will incur floating point errors but no premature rounding - result = result.round("ms") - tm.assert_series_equal(result, expected) - - s = pd.concat( - [Series([epoch + t for t in range(20)]).astype(float), Series([np.nan])], - ignore_index=True, - ) - result = to_datetime(s, unit="s") - expected = Series( - [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] - + [NaT] - ) - tm.assert_series_equal(result, expected) - - result = to_datetime([1, 2, "NaT", NaT, np.nan], unit="D") - expected = DatetimeIndex( - [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 3 - ) - tm.assert_index_equal(result, expected) - - msg = "non convertible value foo with the unit 'D'" - with pytest.raises(ValueError, match=msg): - to_datetime([1, 2, "foo"], unit="D") - msg = "cannot convert input 111111111 with the unit 'D'" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime([1, 2, 111111111], unit="D") - - # coerce we can process - expected = DatetimeIndex( - [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 1 - ) - result = to_datetime([1, 2, "foo"], unit="D", errors="coerce") - tm.assert_index_equal(result, expected) - - result = to_datetime([1, 2, 111111111], unit="D", errors="coerce") - tm.assert_index_equal(result, expected) - class TestToDatetimeMisc: def test_to_datetime_barely_out_of_bounds(self): @@ -1614,7 +1605,6 @@ def test_to_datetime_barely_out_of_bounds(self): with pytest.raises(OutOfBoundsDatetime, match=msg): to_datetime(arr) - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_iso8601(self, cache): result = to_datetime(["2012-01-01 00:00:00"], cache=cache) exp = Timestamp("2012-01-01 00:00:00") @@ -1624,19 +1614,17 @@ def test_to_datetime_iso8601(self, cache): exp = Timestamp("2012-10-01") assert result[0] == exp - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_default(self, cache): rs = to_datetime("2001", cache=cache) xp = datetime(2001, 1, 1) assert rs == xp # dayfirst is essentially broken 
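+        # (dateutil silently falls back to month-first when dayfirst cannot
+        #  apply, e.g. '01-13-2012', so no error is raised)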
- + # FIXME: don't leave commented-out # to_datetime('01-13-2012', dayfirst=True) # pytest.raises(ValueError, to_datetime('01-13-2012', # dayfirst=True)) - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_on_datetime64_series(self, cache): # #2699 s = Series(date_range("1/1/2000", periods=10)) @@ -1644,7 +1632,6 @@ def test_to_datetime_on_datetime64_series(self, cache): result = to_datetime(s, cache=cache) assert result[0] == s[0] - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_with_space_in_series(self, cache): # GH 6428 s = Series(["10/18/2006", "10/18/2008", " "]) @@ -1658,7 +1645,6 @@ def test_to_datetime_with_space_in_series(self, cache): tm.assert_series_equal(result_ignore, s) @td.skip_if_has_locale - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_with_apply(self, cache): # this is only locale tested with US/None locales # GH 5195 @@ -1681,7 +1667,6 @@ def test_to_datetime_with_apply(self, cache): ) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_types(self, cache): # empty string @@ -1701,18 +1686,19 @@ def test_to_datetime_types(self, cache): result = to_datetime("2012", cache=cache) assert result == expected + # FIXME: don't leave commented-out # array = ['2012','20120101','20120101 12:01:01'] array = ["20120101", "20120101 12:01:01"] expected = list(to_datetime(array, cache=cache)) result = [Timestamp(date_str) for date_str in array] tm.assert_almost_equal(result, expected) + # FIXME: don't leave commented-out # currently fails ### # result = Timestamp('2012') # expected = to_datetime('2012') # assert result == expected - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_unprocessable_input(self, cache): # GH 4928 # GH 21864 @@ -1724,7 +1710,6 @@ def test_to_datetime_unprocessable_input(self, cache): with pytest.raises(TypeError, match=msg): to_datetime([1, "1"], errors="raise", cache=cache) - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_unhashable_input(self, cache): series = Series([["a"]] * 100) result = to_datetime(series, errors="ignore", cache=cache) @@ -1765,7 +1750,6 @@ def test_to_datetime_overflow(self): with pytest.raises(OutOfBoundsTimedelta, match=msg): date_range(start="1/1/1700", freq="B", periods=100000) - @pytest.mark.parametrize("cache", [True, False]) def test_string_na_nat_conversion(self, cache): # GH #999, #858 @@ -1846,7 +1830,6 @@ def test_string_na_nat_conversion(self, cache): "datetime64[ns]", ], ) - @pytest.mark.parametrize("cache", [True, False]) def test_dti_constructor_numpy_timeunits(self, cache, dtype): # GH 9114 base = to_datetime(["2000-01-01T00:00", "2000-01-02T00:00", "NaT"], cache=cache) @@ -1856,7 +1839,6 @@ def test_dti_constructor_numpy_timeunits(self, cache, dtype): tm.assert_index_equal(DatetimeIndex(values), base) tm.assert_index_equal(to_datetime(values, cache=cache), base) - @pytest.mark.parametrize("cache", [True, False]) def test_dayfirst(self, cache): # GH 5917 arr = ["10/02/2014", "11/02/2014", "12/02/2014"] @@ -1980,7 +1962,6 @@ def test_guess_datetime_format_for_array(self): class TestToDatetimeInferFormat: - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_infer_datetime_format_consistent_format(self, cache): s = Series(date_range("20000101", periods=50, freq="H")) @@ -2002,7 +1983,6 @@ def test_to_datetime_infer_datetime_format_consistent_format(self, cache): tm.assert_series_equal(with_format, no_infer) 
tm.assert_series_equal(no_infer, yes_infer) - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_infer_datetime_format_inconsistent_format(self, cache): s = Series( np.array( @@ -2024,7 +2004,6 @@ def test_to_datetime_infer_datetime_format_inconsistent_format(self, cache): to_datetime(s, infer_datetime_format=True, cache=cache), ) - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_infer_datetime_format_series_with_nans(self, cache): s = Series( np.array( @@ -2037,7 +2016,6 @@ def test_to_datetime_infer_datetime_format_series_with_nans(self, cache): to_datetime(s, infer_datetime_format=True, cache=cache), ) - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_infer_datetime_format_series_start_with_nans(self, cache): s = Series( np.array( @@ -2086,7 +2064,6 @@ def test_infer_datetime_format_zero_tz(self, ts, zero_tz, is_utc): expected = Series([Timestamp(ts, tz=tz)]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_iso8601_noleading_0s(self, cache): # GH 11871 s = Series(["2014-1-1", "2014-2-2", "2015-3-3"]) @@ -2104,7 +2081,6 @@ def test_to_datetime_iso8601_noleading_0s(self, cache): class TestDaysInMonth: # tests for issue #10154 - @pytest.mark.parametrize("cache", [True, False]) def test_day_not_in_month_coerce(self, cache): assert isna(to_datetime("2015-02-29", errors="coerce", cache=cache)) assert isna( @@ -2117,7 +2093,6 @@ def test_day_not_in_month_coerce(self, cache): to_datetime("2015-04-31", format="%Y-%m-%d", errors="coerce", cache=cache) ) - @pytest.mark.parametrize("cache", [True, False]) def test_day_not_in_month_raise(self, cache): msg = "day is out of range for month" with pytest.raises(ValueError, match=msg): @@ -2135,7 +2110,6 @@ def test_day_not_in_month_raise(self, cache): with pytest.raises(ValueError, match=msg): to_datetime("2015-04-31", errors="raise", format="%Y-%m-%d", cache=cache) - @pytest.mark.parametrize("cache", [True, False]) def test_day_not_in_month_ignore(self, cache): assert to_datetime("2015-02-29", errors="ignore", cache=cache) == "2015-02-29" assert ( @@ -2205,7 +2179,6 @@ class TestDatetimeParsingWrappers: }.items() ), ) - @pytest.mark.parametrize("cache", [True, False]) def test_parsers(self, date_str, expected, cache): # dateutil >= 2.5.0 defaults to yearfirst=True @@ -2237,7 +2210,6 @@ def test_parsers(self, date_str, expected, cache): result7 = date_range(date_str, freq="S", periods=1, yearfirst=yearfirst) assert result7 == expected - @pytest.mark.parametrize("cache", [True, False]) def test_na_values_with_cache( self, cache, unique_nulls_fixture, unique_nulls_fixture2 ): @@ -2257,7 +2229,6 @@ def test_parsers_nat(self): assert result3 is NaT assert result4 is NaT - @pytest.mark.parametrize("cache", [True, False]) def test_parsers_dayfirst_yearfirst(self, cache): # OK # 2.5.1 10-11-12 [dayfirst=0, yearfirst=0] -> 2012-10-11 00:00:00 @@ -2345,7 +2316,6 @@ def test_parsers_dayfirst_yearfirst(self, cache): assert result3 == expected assert result4 == expected - @pytest.mark.parametrize("cache", [True, False]) def test_parsers_timestring(self, cache): # must be the same as dateutil result cases = { @@ -2368,7 +2338,6 @@ def test_parsers_timestring(self, cache): assert result4 == exp_now assert result5 == exp_now - @pytest.mark.parametrize("cache", [True, False]) @pytest.mark.parametrize( "dt_string, tz, dt_string_repr", [ @@ -2564,29 +2533,44 @@ def test_arg_tz_ns_unit(self, offset, utc, exp): tm.assert_index_equal(result, 
expected) -@pytest.mark.parametrize( - "listlike,do_caching", - [([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], False), ([1, 1, 1, 1, 4, 5, 6, 7, 8, 9], True)], -) -def test_should_cache(listlike, do_caching): - assert ( - tools.should_cache(listlike, check_count=len(listlike), unique_share=0.7) - == do_caching +class TestShouldCache: + @pytest.mark.parametrize( + "listlike,do_caching", + [ + ([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], False), + ([1, 1, 1, 1, 4, 5, 6, 7, 8, 9], True), + ], ) + def test_should_cache(self, listlike, do_caching): + assert ( + tools.should_cache(listlike, check_count=len(listlike), unique_share=0.7) + == do_caching + ) + @pytest.mark.parametrize( + "unique_share,check_count, err_message", + [ + (0.5, 11, r"check_count must be in next bounds: \[0; len\(arg\)\]"), + (10, 2, r"unique_share must be in next bounds: \(0; 1\)"), + ], + ) + def test_should_cache_errors(self, unique_share, check_count, err_message): + arg = [5] * 10 -@pytest.mark.parametrize( - "unique_share,check_count, err_message", - [ - (0.5, 11, r"check_count must be in next bounds: \[0; len\(arg\)\]"), - (10, 2, r"unique_share must be in next bounds: \(0; 1\)"), - ], -) -def test_should_cache_errors(unique_share, check_count, err_message): - arg = [5] * 10 + with pytest.raises(AssertionError, match=err_message): + tools.should_cache(arg, unique_share, check_count) - with pytest.raises(AssertionError, match=err_message): - tools.should_cache(arg, unique_share, check_count) + @pytest.mark.parametrize( + "listlike", + [ + (deque([Timestamp("2010-06-02 09:30:00")] * 51)), + ([Timestamp("2010-06-02 09:30:00")] * 51), + (tuple([Timestamp("2010-06-02 09:30:00")] * 51)), + ], + ) + def test_no_slicing_errors_in_should_cache(self, listlike): + # GH#29403 + assert tools.should_cache(listlike) is True def test_nullable_integer_to_datetime(): @@ -2624,7 +2608,7 @@ def test_na_to_datetime(nulls_fixture, klass): assert result[0] is NaT -def test_empty_string_datetime_coerce__format(): +def test_empty_string_datetime_coerce_format(): # GH13044 td = Series(["03/24/2016", "03/25/2016", ""]) format = "%m/%d/%Y" From a23f5df970445a08c2215b51f6fd64c0e2840860 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 11 Nov 2021 21:10:23 +0100 Subject: [PATCH 20/53] TST: use custom parametrization for consistency in base extension array tests (#44332) --- pandas/tests/extension/base/ops.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index c52f20255eb81..1d3d736ca7ee2 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -162,13 +162,12 @@ def test_compare_array(self, data, comparison_op): other = pd.Series([data[0]] * len(data)) self._compare_other(ser, data, comparison_op, other) - def test_direct_arith_with_ndframe_returns_not_implemented( - self, data, frame_or_series - ): + @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame]) + def test_direct_arith_with_ndframe_returns_not_implemented(self, data, box): # EAs should return NotImplemented for ops with Series/DataFrame # Pandas takes care of unboxing the series and calling the EA's op. 
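# --- Editor's note (illustrative sketch, not part of this patch): the
# protocol this test exercises is plain Python dispatch: an ExtensionArray
# dunder returns NotImplemented when handed a Series/DataFrame, and pandas
# then unboxes the container and calls back into the EA's op. A toy shape:
#
#     class ToyArray:
#         def __add__(self, other):
#             if isinstance(other, (pd.Series, pd.DataFrame)):
#                 return NotImplemented  # defer to pandas' unboxing
#             return ...  # elementwise logic would go here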
other = pd.Series(data) - if frame_or_series is pd.DataFrame: + if box is pd.DataFrame: other = other.to_frame() if hasattr(data, "__eq__"): From 7a31ca8d979e6f96cc3a62597279c30495ede2b7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 11 Nov 2021 13:02:18 -0800 Subject: [PATCH 21/53] TST: parametrize arithmetic tests (#44395) --- pandas/tests/arithmetic/common.py | 33 +++- pandas/tests/arithmetic/test_datetime64.py | 104 +++------- pandas/tests/arithmetic/test_numeric.py | 21 +-- pandas/tests/arithmetic/test_object.py | 28 +-- pandas/tests/arithmetic/test_period.py | 198 +++++++++----------- pandas/tests/arithmetic/test_timedelta64.py | 18 +- 6 files changed, 175 insertions(+), 227 deletions(-) diff --git a/pandas/tests/arithmetic/common.py b/pandas/tests/arithmetic/common.py index af70cdfe538bb..f3173e8f0eb57 100644 --- a/pandas/tests/arithmetic/common.py +++ b/pandas/tests/arithmetic/common.py @@ -11,7 +11,26 @@ array, ) import pandas._testing as tm -from pandas.core.arrays import PandasArray +from pandas.core.arrays import ( + BooleanArray, + PandasArray, +) + + +def assert_cannot_add(left, right, msg="cannot add"): + """ + Helper to assert that left and right cannot be added. + + Parameters + ---------- + left : object + right : object + msg : str, default "cannot add" + """ + with pytest.raises(TypeError, match=msg): + left + right + with pytest.raises(TypeError, match=msg): + right + left def assert_invalid_addsub_type(left, right, msg=None): @@ -79,21 +98,29 @@ def xbox2(x): # just exclude PandasArray[bool] if isinstance(x, PandasArray): return x._ndarray + if isinstance(x, BooleanArray): + # NB: we are assuming no pd.NAs for now + return x.astype(bool) return x + # rev_box: box to use for reversed comparisons + rev_box = xbox + if isinstance(right, Index) and isinstance(left, Series): + rev_box = np.array + result = xbox2(left == right) expected = xbox(np.zeros(result.shape, dtype=np.bool_)) tm.assert_equal(result, expected) result = xbox2(right == left) - tm.assert_equal(result, expected) + tm.assert_equal(result, rev_box(expected)) result = xbox2(left != right) tm.assert_equal(result, ~expected) result = xbox2(right != left) - tm.assert_equal(result, ~expected) + tm.assert_equal(result, rev_box(~expected)) msg = "|".join( [ diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index bff461dbc7038..87bbdfb3c808f 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -41,6 +41,7 @@ ) from pandas.core.ops import roperator from pandas.tests.arithmetic.common import ( + assert_cannot_add, assert_invalid_addsub_type, assert_invalid_comparison, get_upcast_box, @@ -99,6 +100,7 @@ def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, box_with_arra @pytest.mark.parametrize( "other", [ + # GH#4968 invalid date/int comparisons list(range(10)), np.arange(10), np.arange(10).astype(np.float32), @@ -111,13 +113,14 @@ def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, box_with_arra pd.period_range("1971-01-01", freq="D", periods=10).astype(object), ], ) - def test_dt64arr_cmp_arraylike_invalid(self, other, tz_naive_fixture): - # We don't parametrize this over box_with_array because listlike - # other plays poorly with assert_invalid_comparison reversed checks + def test_dt64arr_cmp_arraylike_invalid( + self, other, tz_naive_fixture, box_with_array + ): tz = tz_naive_fixture dta = date_range("1970-01-01", freq="ns", periods=10, tz=tz)._data - 
assert_invalid_comparison(dta, other, tm.to_array) + obj = tm.box_expected(dta, box_with_array) + assert_invalid_comparison(obj, other, box_with_array) def test_dt64arr_cmp_mixed_invalid(self, tz_naive_fixture): tz = tz_naive_fixture @@ -215,18 +218,6 @@ def test_nat_comparisons( tm.assert_series_equal(result, expected) - def test_comparison_invalid(self, tz_naive_fixture, box_with_array): - # GH#4968 - # invalid date/int comparisons - tz = tz_naive_fixture - ser = Series(range(5)) - ser2 = Series(date_range("20010101", periods=5, tz=tz)) - - ser = tm.box_expected(ser, box_with_array) - ser2 = tm.box_expected(ser2, box_with_array) - - assert_invalid_comparison(ser, ser2, box_with_array) - @pytest.mark.parametrize( "data", [ @@ -315,8 +306,8 @@ def test_timestamp_compare_series(self, left, right): tm.assert_series_equal(result, expected) # Compare to NaT with series containing NaT - expected = left_f(s_nat, Timestamp("nat")) - result = right_f(Timestamp("nat"), s_nat) + expected = left_f(s_nat, NaT) + result = right_f(NaT, s_nat) tm.assert_series_equal(result, expected) def test_dt64arr_timestamp_equality(self, box_with_array): @@ -832,17 +823,6 @@ def test_dt64arr_add_timedeltalike_scalar( result = rng + two_hours tm.assert_equal(result, expected) - def test_dt64arr_iadd_timedeltalike_scalar( - self, tz_naive_fixture, two_hours, box_with_array - ): - tz = tz_naive_fixture - - rng = date_range("2000-01-01", "2000-02-01", tz=tz) - expected = date_range("2000-01-01 02:00", "2000-02-01 02:00", tz=tz) - - rng = tm.box_expected(rng, box_with_array) - expected = tm.box_expected(expected, box_with_array) - rng += two_hours tm.assert_equal(rng, expected) @@ -860,17 +840,6 @@ def test_dt64arr_sub_timedeltalike_scalar( result = rng - two_hours tm.assert_equal(result, expected) - def test_dt64arr_isub_timedeltalike_scalar( - self, tz_naive_fixture, two_hours, box_with_array - ): - tz = tz_naive_fixture - - rng = date_range("2000-01-01", "2000-02-01", tz=tz) - expected = date_range("1999-12-31 22:00", "2000-01-31 22:00", tz=tz) - - rng = tm.box_expected(rng, box_with_array) - expected = tm.box_expected(expected, box_with_array) - rng -= two_hours tm.assert_equal(rng, expected) @@ -1071,21 +1040,14 @@ def test_dt64arr_add_dt64ndarray_raises(self, tz_naive_fixture, box_with_array): dt64vals = dti.values dtarr = tm.box_expected(dti, box_with_array) - msg = "cannot add" - with pytest.raises(TypeError, match=msg): - dtarr + dt64vals - with pytest.raises(TypeError, match=msg): - dt64vals + dtarr + assert_cannot_add(dtarr, dt64vals) def test_dt64arr_add_timestamp_raises(self, box_with_array): # GH#22163 ensure DataFrame doesn't cast Timestamp to i8 idx = DatetimeIndex(["2011-01-01", "2011-01-02"]) + ts = idx[0] idx = tm.box_expected(idx, box_with_array) - msg = "cannot add" - with pytest.raises(TypeError, match=msg): - idx + Timestamp("2011-01-01") - with pytest.raises(TypeError, match=msg): - Timestamp("2011-01-01") + idx + assert_cannot_add(idx, ts) # ------------------------------------------------------------- # Other Invalid Addition/Subtraction @@ -1267,13 +1229,12 @@ def test_dti_add_tick_tzaware(self, tz_aware_fixture, box_with_array): dates = tm.box_expected(dates, box_with_array) expected = tm.box_expected(expected, box_with_array) - # TODO: parametrize over the scalar being added? radd? sub? 
- offset = dates + pd.offsets.Hour(5) - tm.assert_equal(offset, expected) - offset = dates + np.timedelta64(5, "h") - tm.assert_equal(offset, expected) - offset = dates + timedelta(hours=5) - tm.assert_equal(offset, expected) + # TODO: sub? + for scalar in [pd.offsets.Hour(5), np.timedelta64(5, "h"), timedelta(hours=5)]: + offset = dates + scalar + tm.assert_equal(offset, expected) + offset = scalar + dates + tm.assert_equal(offset, expected) # ------------------------------------------------------------- # RelativeDelta DateOffsets @@ -1941,8 +1902,7 @@ def test_dt64_mul_div_numeric_invalid(self, one, dt64_series): one / dt64_series # TODO: parametrize over box - @pytest.mark.parametrize("op", ["__add__", "__radd__", "__sub__", "__rsub__"]) - def test_dt64_series_add_intlike(self, tz_naive_fixture, op): + def test_dt64_series_add_intlike(self, tz_naive_fixture): # GH#19123 tz = tz_naive_fixture dti = DatetimeIndex(["2016-01-02", "2016-02-03", "NaT"], tz=tz) @@ -1950,21 +1910,16 @@ def test_dt64_series_add_intlike(self, tz_naive_fixture, op): other = Series([20, 30, 40], dtype="uint8") - method = getattr(ser, op) msg = "|".join( [ "Addition/subtraction of integers and integer-arrays", "cannot subtract .* from ndarray", ] ) - with pytest.raises(TypeError, match=msg): - method(1) - with pytest.raises(TypeError, match=msg): - method(other) - with pytest.raises(TypeError, match=msg): - method(np.array(other)) - with pytest.raises(TypeError, match=msg): - method(pd.Index(other)) + assert_invalid_addsub_type(ser, 1, msg) + assert_invalid_addsub_type(ser, other, msg) + assert_invalid_addsub_type(ser, np.array(other), msg) + assert_invalid_addsub_type(ser, pd.Index(other), msg) # ------------------------------------------------------------- # Timezone-Centric Tests @@ -2062,7 +2017,9 @@ def test_dti_add_intarray_tick(self, int_holder, freq): dti = date_range("2016-01-01", periods=2, freq=freq) other = int_holder([4, -1]) - msg = "Addition/subtraction of integers|cannot subtract DatetimeArray from" + msg = "|".join( + ["Addition/subtraction of integers", "cannot subtract DatetimeArray from"] + ) assert_invalid_addsub_type(dti, other, msg) @pytest.mark.parametrize("freq", ["W", "M", "MS", "Q"]) @@ -2072,7 +2029,9 @@ def test_dti_add_intarray_non_tick(self, int_holder, freq): dti = date_range("2016-01-01", periods=2, freq=freq) other = int_holder([4, -1]) - msg = "Addition/subtraction of integers|cannot subtract DatetimeArray from" + msg = "|".join( + ["Addition/subtraction of integers", "cannot subtract DatetimeArray from"] + ) assert_invalid_addsub_type(dti, other, msg) @pytest.mark.parametrize("int_holder", [np.array, pd.Index]) @@ -2222,10 +2181,7 @@ def test_add_datetimelike_and_dtarr(self, box_with_array, addend, tz): dtarr = tm.box_expected(dti, box_with_array) msg = "cannot add DatetimeArray and" - with pytest.raises(TypeError, match=msg): - dtarr + addend - with pytest.raises(TypeError, match=msg): - addend + dtarr + assert_cannot_add(dtarr, addend, msg) # ------------------------------------------------------------- diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 9932adccdbaf2..3bf5fdb257c2a 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -29,6 +29,7 @@ UInt64Index, ) from pandas.core.computation import expressions as expr +from pandas.tests.arithmetic.common import assert_invalid_comparison @pytest.fixture(params=[Index, Series, tm.to_array]) @@ -84,25 +85,13 @@ def 
test_operator_series_comparison_zerorank(self): expected = 0.0 > Series([1, 2, 3]) tm.assert_series_equal(result, expected) - def test_df_numeric_cmp_dt64_raises(self): + def test_df_numeric_cmp_dt64_raises(self, box_with_array): # GH#8932, GH#22163 ts = pd.Timestamp.now() - df = pd.DataFrame({"x": range(5)}) + obj = np.array(range(5)) + obj = tm.box_expected(obj, box_with_array) - msg = ( - "'[<>]' not supported between instances of 'numpy.ndarray' and 'Timestamp'" - ) - with pytest.raises(TypeError, match=msg): - df > ts - with pytest.raises(TypeError, match=msg): - df < ts - with pytest.raises(TypeError, match=msg): - ts < df - with pytest.raises(TypeError, match=msg): - ts > df - - assert not (df == ts).any().any() - assert (df != ts).all().all() + assert_invalid_comparison(obj, ts, box_with_array) def test_compare_invalid(self): # GH#8058 diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index 9a586fd553428..3069868ebb677 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -21,17 +21,15 @@ class TestObjectComparisons: - def test_comparison_object_numeric_nas(self): + def test_comparison_object_numeric_nas(self, comparison_op): ser = Series(np.random.randn(10), dtype=object) shifted = ser.shift(2) - ops = ["lt", "le", "gt", "ge", "eq", "ne"] - for op in ops: - func = getattr(operator, op) + func = comparison_op - result = func(ser, shifted) - expected = func(ser.astype(float), shifted.astype(float)) - tm.assert_series_equal(result, expected) + result = func(ser, shifted) + expected = func(ser.astype(float), shifted.astype(float)) + tm.assert_series_equal(result, expected) def test_object_comparisons(self): ser = Series(["a", "b", np.nan, "c", "a"]) @@ -141,11 +139,13 @@ def test_objarr_radd_str_invalid(self, dtype, data, box_with_array): ser = Series(data, dtype=dtype) ser = tm.box_expected(ser, box_with_array) - msg = ( - "can only concatenate str|" - "did not contain a loop with signature matching types|" - "unsupported operand type|" - "must be str" + msg = "|".join( + [ + "can only concatenate str", + "did not contain a loop with signature matching types", + "unsupported operand type", + "must be str", + ] ) with pytest.raises(TypeError, match=msg): "foo_" + ser @@ -159,7 +159,9 @@ def test_objarr_add_invalid(self, op, box_with_array): obj_ser.name = "objects" obj_ser = tm.box_expected(obj_ser, box) - msg = "can only concatenate str|unsupported operand type|must be str" + msg = "|".join( + ["can only concatenate str", "unsupported operand type", "must be str"] + ) with pytest.raises(Exception, match=msg): op(obj_ser, 1) with pytest.raises(Exception, match=msg): diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index f8814a33292ec..f4404a3483e6f 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -26,6 +26,7 @@ from pandas.core import ops from pandas.core.arrays import TimedeltaArray from pandas.tests.arithmetic.common import ( + assert_invalid_addsub_type, assert_invalid_comparison, get_upcast_box, ) @@ -39,6 +40,20 @@ class TestPeriodArrayLikeComparisons: # DataFrame/Series/PeriodIndex/PeriodArray. Ideally all comparison # tests will eventually end up here. 
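# --- Editor's note (illustrative sketch, not part of this patch): the
# box/xbox pattern used in the tests added below: tm.box_expected wraps raw
# values in the container under test, and get_upcast_box(left, other, True)
# presumably resolves the container the comparison result comes back in
# (comparing an Index, for instance, yields a plain ndarray). Rough usage:
#
#     idx = tm.box_expected(period_range("2017", periods=3, freq="D"), box)
#     xbox = get_upcast_box(idx, other, True)
#     expected = tm.box_expected(np.array([True, True, False]), xbox)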
+ @pytest.mark.parametrize("other", ["2017", Period("2017", freq="D")]) + def test_eq_scalar(self, other, box_with_array): + + idx = PeriodIndex(["2017", "2017", "2018"], freq="D") + idx = tm.box_expected(idx, box_with_array) + xbox = get_upcast_box(idx, other, True) + + expected = np.array([True, True, False]) + expected = tm.box_expected(expected, xbox) + + result = idx == other + + tm.assert_equal(result, expected) + def test_compare_zerodim(self, box_with_array): # GH#26689 make sure we unbox zero-dimensional arrays @@ -54,9 +69,20 @@ def test_compare_zerodim(self, box_with_array): tm.assert_equal(result, expected) @pytest.mark.parametrize( - "scalar", ["foo", Timestamp.now(), Timedelta(days=4), 9, 9.5] + "scalar", + [ + "foo", + Timestamp.now(), + Timedelta(days=4), + 9, + 9.5, + 2000, # specifically don't consider 2000 to match Period("2000", "D") + False, + None, + ], ) def test_compare_invalid_scalar(self, box_with_array, scalar): + # GH#28980 # comparison with scalar that cannot be interpreted as a Period pi = period_range("2000", periods=4) parr = tm.box_expected(pi, box_with_array) @@ -70,6 +96,11 @@ def test_compare_invalid_scalar(self, box_with_array, scalar): np.arange(4), np.arange(4).astype(np.float64), list(range(4)), + # match Period semantics by not treating integers as Periods + [2000, 2001, 2002, 2003], + np.arange(2000, 2004), + np.arange(2000, 2004).astype(object), + pd.Index([2000, 2001, 2002, 2003]), ], ) def test_compare_invalid_listlike(self, box_with_array, other): @@ -138,68 +169,27 @@ def test_compare_object_dtype(self, box_with_array, other_box): class TestPeriodIndexComparisons: # TODO: parameterize over boxes - @pytest.mark.parametrize("other", ["2017", Period("2017", freq="D")]) - def test_eq(self, other): - idx = PeriodIndex(["2017", "2017", "2018"], freq="D") - expected = np.array([True, True, False]) - result = idx == other - - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.parametrize( - "other", - [ - 2017, - [2017, 2017, 2017], - np.array([2017, 2017, 2017]), - np.array([2017, 2017, 2017], dtype=object), - pd.Index([2017, 2017, 2017]), - ], - ) - def test_eq_integer_disallowed(self, other): - # match Period semantics by not treating integers as Periods - - idx = PeriodIndex(["2017", "2017", "2018"], freq="D") - expected = np.array([False, False, False]) - result = idx == other - - tm.assert_numpy_array_equal(result, expected) - msg = "|".join( - [ - "not supported between instances of 'Period' and 'int'", - r"Invalid comparison between dtype=period\[D\] and ", - ] - ) - with pytest.raises(TypeError, match=msg): - idx < other - with pytest.raises(TypeError, match=msg): - idx > other - with pytest.raises(TypeError, match=msg): - idx <= other - with pytest.raises(TypeError, match=msg): - idx >= other - def test_pi_cmp_period(self): idx = period_range("2007-01", periods=20, freq="M") + per = idx[10] - result = idx < idx[10] + result = idx < per exp = idx.values < idx.values[10] tm.assert_numpy_array_equal(result, exp) # Tests Period.__richcmp__ against ndarray[object, ndim=2] - result = idx.values.reshape(10, 2) < idx[10] + result = idx.values.reshape(10, 2) < per tm.assert_numpy_array_equal(result, exp.reshape(10, 2)) # Tests Period.__richcmp__ against ndarray[object, ndim=0] - result = idx < np.array(idx[10]) + result = idx < np.array(per) tm.assert_numpy_array_equal(result, exp) # TODO: moved from test_datetime64; de-duplicate with version below def test_parr_cmp_period_scalar2(self, box_with_array): pi = period_range("2000-01-01", 
periods=10, freq="D") - val = Period("2000-01-04", freq="D") - + val = pi[3] expected = [x > val for x in pi] ser = tm.box_expected(pi, box_with_array) @@ -326,23 +316,24 @@ def test_parr_cmp_pi_mismatched_freq(self, freq, box_with_array): @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) def test_pi_cmp_nat(self, freq): idx1 = PeriodIndex(["2011-01", "2011-02", "NaT", "2011-05"], freq=freq) + per = idx1[1] - result = idx1 > Period("2011-02", freq=freq) + result = idx1 > per exp = np.array([False, False, False, True]) tm.assert_numpy_array_equal(result, exp) - result = Period("2011-02", freq=freq) < idx1 + result = per < idx1 tm.assert_numpy_array_equal(result, exp) - result = idx1 == Period("NaT", freq=freq) + result = idx1 == pd.NaT exp = np.array([False, False, False, False]) tm.assert_numpy_array_equal(result, exp) - result = Period("NaT", freq=freq) == idx1 + result = pd.NaT == idx1 tm.assert_numpy_array_equal(result, exp) - result = idx1 != Period("NaT", freq=freq) + result = idx1 != pd.NaT exp = np.array([True, True, True, True]) tm.assert_numpy_array_equal(result, exp) - result = Period("NaT", freq=freq) != idx1 + result = pd.NaT != idx1 tm.assert_numpy_array_equal(result, exp) idx2 = PeriodIndex(["2011-02", "2011-01", "2011-04", "NaT"], freq=freq) @@ -475,28 +466,29 @@ def test_pi_comp_period(self): idx = PeriodIndex( ["2011-01", "2011-02", "2011-03", "2011-04"], freq="M", name="idx" ) + per = idx[2] - f = lambda x: x == Period("2011-03", freq="M") + f = lambda x: x == per exp = np.array([False, False, True, False], dtype=np.bool_) self._check(idx, f, exp) - f = lambda x: Period("2011-03", freq="M") == x + f = lambda x: per == x self._check(idx, f, exp) - f = lambda x: x != Period("2011-03", freq="M") + f = lambda x: x != per exp = np.array([True, True, False, True], dtype=np.bool_) self._check(idx, f, exp) - f = lambda x: Period("2011-03", freq="M") != x + f = lambda x: per != x self._check(idx, f, exp) - f = lambda x: Period("2011-03", freq="M") >= x + f = lambda x: per >= x exp = np.array([True, True, True, False], dtype=np.bool_) self._check(idx, f, exp) - f = lambda x: x > Period("2011-03", freq="M") + f = lambda x: x > per exp = np.array([False, False, False, True], dtype=np.bool_) self._check(idx, f, exp) - f = lambda x: Period("2011-03", freq="M") >= x + f = lambda x: per >= x exp = np.array([True, True, True, False], dtype=np.bool_) self._check(idx, f, exp) @@ -504,11 +496,12 @@ def test_pi_comp_period_nat(self): idx = PeriodIndex( ["2011-01", "NaT", "2011-03", "2011-04"], freq="M", name="idx" ) + per = idx[2] - f = lambda x: x == Period("2011-03", freq="M") + f = lambda x: x == per exp = np.array([False, False, True, False], dtype=np.bool_) self._check(idx, f, exp) - f = lambda x: Period("2011-03", freq="M") == x + f = lambda x: per == x self._check(idx, f, exp) f = lambda x: x == pd.NaT @@ -517,10 +510,10 @@ def test_pi_comp_period_nat(self): f = lambda x: pd.NaT == x self._check(idx, f, exp) - f = lambda x: x != Period("2011-03", freq="M") + f = lambda x: x != per exp = np.array([True, True, False, True], dtype=np.bool_) self._check(idx, f, exp) - f = lambda x: Period("2011-03", freq="M") != x + f = lambda x: per != x self._check(idx, f, exp) f = lambda x: x != pd.NaT @@ -529,11 +522,11 @@ def test_pi_comp_period_nat(self): f = lambda x: pd.NaT != x self._check(idx, f, exp) - f = lambda x: Period("2011-03", freq="M") >= x + f = lambda x: per >= x exp = np.array([True, False, True, False], dtype=np.bool_) self._check(idx, f, exp) - f = lambda x: x < Period("2011-03", 
freq="M") + f = lambda x: x < per exp = np.array([True, False, False, False], dtype=np.bool_) self._check(idx, f, exp) @@ -696,20 +689,6 @@ def test_sub_n_gt_1_offsets(self, offset, kwd_name, n): # ------------------------------------------------------------- # Invalid Operations - @pytest.mark.parametrize("other", [3.14, np.array([2.0, 3.0])]) - @pytest.mark.parametrize("op", [operator.add, ops.radd, operator.sub, ops.rsub]) - def test_parr_add_sub_float_raises(self, op, other, box_with_array): - dti = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], freq="D") - pi = dti.to_period("D") - pi = tm.box_expected(pi, box_with_array) - msg = ( - r"unsupported operand type\(s\) for [+-]: .* and .*|" - "Concatenation operation is not implemented for NumPy arrays" - ) - - with pytest.raises(TypeError, match=msg): - op(pi, other) - @pytest.mark.parametrize( "other", [ @@ -723,6 +702,8 @@ def test_parr_add_sub_float_raises(self, op, other, box_with_array): pd.date_range("2016-01-01", periods=3, freq="S")._data, pd.date_range("2016-01-01", periods=3, tz="Asia/Tokyo")._data, # Miscellaneous invalid types + 3.14, + np.array([2.0, 3.0, 4.0]), ], ) def test_parr_add_sub_invalid(self, other, box_with_array): @@ -730,11 +711,15 @@ def test_parr_add_sub_invalid(self, other, box_with_array): rng = period_range("1/1/2000", freq="D", periods=3) rng = tm.box_expected(rng, box_with_array) - msg = ( - r"(:?cannot add PeriodArray and .*)" - r"|(:?cannot subtract .* from (:?a\s)?.*)" - r"|(:?unsupported operand type\(s\) for \+: .* and .*)" + msg = "|".join( + [ + r"(:?cannot add PeriodArray and .*)", + r"(:?cannot subtract .* from (:?a\s)?.*)", + r"(:?unsupported operand type\(s\) for \+: .* and .*)", + r"unsupported operand type\(s\) for [+-]: .* and .*", + ] ) + assert_invalid_addsub_type(rng, other, msg) with pytest.raises(TypeError, match=msg): rng + other with pytest.raises(TypeError, match=msg): @@ -1034,9 +1019,11 @@ def test_pi_add_timedeltalike_minute_gt1(self, three_days): result = rng - other tm.assert_index_equal(result, expected) - msg = ( - r"(:?bad operand type for unary -: 'PeriodArray')" - r"|(:?cannot subtract PeriodArray from timedelta64\[[hD]\])" + msg = "|".join( + [ + r"(:?bad operand type for unary -: 'PeriodArray')", + r"(:?cannot subtract PeriodArray from timedelta64\[[hD]\])", + ] ) with pytest.raises(TypeError, match=msg): other - rng @@ -1261,7 +1248,7 @@ def test_parr_add_sub_object_array(self): class TestPeriodSeriesArithmetic: - def test_ops_series_timedelta(self): + def test_parr_add_timedeltalike_scalar(self, three_days, box_with_array): # GH#13043 ser = Series( [Period("2015-01-01", freq="D"), Period("2015-01-02", freq="D")], @@ -1270,21 +1257,18 @@ def test_ops_series_timedelta(self): assert ser.dtype == "Period[D]" expected = Series( - [Period("2015-01-02", freq="D"), Period("2015-01-03", freq="D")], + [Period("2015-01-04", freq="D"), Period("2015-01-05", freq="D")], name="xxx", ) - result = ser + Timedelta("1 days") - tm.assert_series_equal(result, expected) - - result = Timedelta("1 days") + ser - tm.assert_series_equal(result, expected) + obj = tm.box_expected(ser, box_with_array) + expected = tm.box_expected(expected, box_with_array) - result = ser + pd.tseries.offsets.Day() - tm.assert_series_equal(result, expected) + result = obj + three_days + tm.assert_equal(result, expected) - result = pd.tseries.offsets.Day() + ser - tm.assert_series_equal(result, expected) + result = three_days + obj + tm.assert_equal(result, expected) def test_ops_series_period(self): # GH#13043 @@ 
-1368,9 +1352,13 @@ def test_parr_ops_errors(self, ng, func, box_with_array): ["2011-01", "2011-02", "2011-03", "2011-04"], freq="M", name="idx" ) obj = tm.box_expected(idx, box_with_array) - msg = ( - r"unsupported operand type\(s\)|can only concatenate|" - r"must be str|object to str implicitly" + msg = "|".join( + [ + r"unsupported operand type\(s\)", + "can only concatenate", + r"must be str", + "object to str implicitly", + ] ) with pytest.raises(TypeError, match=msg): @@ -1544,11 +1532,3 @@ def test_pi_sub_period_nat(self): exp = TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name="idx") tm.assert_index_equal(idx - Period("NaT", freq="M"), exp) tm.assert_index_equal(Period("NaT", freq="M") - idx, exp) - - @pytest.mark.parametrize("scalars", ["a", False, 1, 1.0, None]) - def test_comparison_operations(self, scalars): - # GH 28980 - expected = Series([False, False]) - s = Series([Period("2019"), Period("2020")], dtype="period[A-DEC]") - result = s == scalars - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 86980ad42766e..8078e8c90a2bf 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -84,11 +84,6 @@ def test_compare_timedelta64_zerodim(self, box_with_array): expected = tm.box_expected(expected, xbox) tm.assert_equal(res, expected) - msg = "Invalid comparison between dtype" - with pytest.raises(TypeError, match=msg): - # zero-dim of wrong dtype should still raise - tdi >= np.array(4) - @pytest.mark.parametrize( "td_scalar", [ @@ -120,6 +115,7 @@ def test_compare_timedeltalike_scalar(self, box_with_array, td_scalar): Timestamp.now().to_datetime64(), Timestamp.now().to_pydatetime(), Timestamp.now().date(), + np.array(4), # zero-dim mismatched dtype ], ) def test_td64_comparisons_invalid(self, box_with_array, invalid): @@ -146,17 +142,18 @@ def test_td64_comparisons_invalid(self, box_with_array, invalid): pd.period_range("1971-01-01", freq="D", periods=10).astype(object), ], ) - def test_td64arr_cmp_arraylike_invalid(self, other): + def test_td64arr_cmp_arraylike_invalid(self, other, box_with_array): # We don't parametrize this over box_with_array because listlike # other plays poorly with assert_invalid_comparison reversed checks rng = timedelta_range("1 days", periods=10)._data - assert_invalid_comparison(rng, other, tm.to_array) + rng = tm.box_expected(rng, box_with_array) + assert_invalid_comparison(rng, other, box_with_array) def test_td64arr_cmp_mixed_invalid(self): rng = timedelta_range("1 days", periods=5)._data - other = np.array([0, 1, 2, rng[3], Timestamp.now()]) + result = rng == other expected = np.array([False, False, False, True, False]) tm.assert_numpy_array_equal(result, expected) @@ -1623,10 +1620,7 @@ def test_td64arr_div_td64_scalar(self, m, unit, box_with_array): box = box_with_array xbox = np.ndarray if box is pd.array else box - startdate = Series(pd.date_range("2013-01-01", "2013-01-03")) - enddate = Series(pd.date_range("2013-03-01", "2013-03-03")) - - ser = enddate - startdate + ser = Series([Timedelta(days=59)] * 3) ser[2] = np.nan flat = ser ser = tm.box_expected(ser, box) From 9e8ab560d1ce820b78663655f0cb7a40bfc74dee Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 11 Nov 2021 15:42:16 -0800 Subject: [PATCH 22/53] REF/TST: collect index tests (#44377) --- .../datetimelike_/test_is_monotonic.py | 46 +++++++ .../datetimes/methods/test_isocalendar.py | 20 ++++ 
pandas/tests/indexes/datetimes/test_asof.py | 17 +++ .../tests/indexes/datetimes/test_freq_attr.py | 61 ++++++++++ pandas/tests/indexes/datetimes/test_misc.py | 15 --- pandas/tests/indexes/datetimes/test_ops.py | 113 ++---------------- pandas/tests/indexes/period/test_freq_attr.py | 21 ++++ pandas/tests/indexes/period/test_period.py | 54 --------- pandas/tests/indexes/period/test_pickle.py | 26 ++++ .../{test_ops.py => test_resolution.py} | 15 +-- pandas/tests/indexes/test_any_index.py | 7 ++ pandas/tests/indexes/test_base.py | 28 ----- pandas/tests/indexes/test_index_new.py | 11 ++ .../indexes/timedeltas/test_freq_attr.py | 61 ++++++++++ .../tests/indexes/timedeltas/test_indexing.py | 14 +++ pandas/tests/indexes/timedeltas/test_ops.py | 72 ----------- pandas/tests/series/test_api.py | 17 +++ 17 files changed, 315 insertions(+), 283 deletions(-) create mode 100644 pandas/tests/indexes/datetimelike_/test_is_monotonic.py create mode 100644 pandas/tests/indexes/datetimes/methods/test_isocalendar.py create mode 100644 pandas/tests/indexes/datetimes/test_freq_attr.py create mode 100644 pandas/tests/indexes/period/test_freq_attr.py create mode 100644 pandas/tests/indexes/period/test_pickle.py rename pandas/tests/indexes/period/{test_ops.py => test_resolution.py} (56%) create mode 100644 pandas/tests/indexes/timedeltas/test_freq_attr.py diff --git a/pandas/tests/indexes/datetimelike_/test_is_monotonic.py b/pandas/tests/indexes/datetimelike_/test_is_monotonic.py new file mode 100644 index 0000000000000..22247c982edbc --- /dev/null +++ b/pandas/tests/indexes/datetimelike_/test_is_monotonic.py @@ -0,0 +1,46 @@ +from pandas import ( + Index, + NaT, + date_range, +) + + +def test_is_monotonic_with_nat(): + # GH#31437 + # PeriodIndex.is_monotonic should behave analogously to DatetimeIndex, + # in particular never be monotonic when we have NaT + dti = date_range("2016-01-01", periods=3) + pi = dti.to_period("D") + tdi = Index(dti.view("timedelta64[ns]")) + + for obj in [pi, pi._engine, dti, dti._engine, tdi, tdi._engine]: + if isinstance(obj, Index): + # i.e. not Engines + assert obj.is_monotonic + assert obj.is_monotonic_increasing + assert not obj.is_monotonic_decreasing + assert obj.is_unique + + dti1 = dti.insert(0, NaT) + pi1 = dti1.to_period("D") + tdi1 = Index(dti1.view("timedelta64[ns]")) + + for obj in [pi1, pi1._engine, dti1, dti1._engine, tdi1, tdi1._engine]: + if isinstance(obj, Index): + # i.e. not Engines + assert not obj.is_monotonic + assert not obj.is_monotonic_increasing + assert not obj.is_monotonic_decreasing + assert obj.is_unique + + dti2 = dti.insert(3, NaT) + pi2 = dti2.to_period("H") + tdi2 = Index(dti2.view("timedelta64[ns]")) + + for obj in [pi2, pi2._engine, dti2, dti2._engine, tdi2, tdi2._engine]: + if isinstance(obj, Index): + # i.e. 
not Engines + assert not obj.is_monotonic + assert not obj.is_monotonic_increasing + assert not obj.is_monotonic_decreasing + assert obj.is_unique diff --git a/pandas/tests/indexes/datetimes/methods/test_isocalendar.py b/pandas/tests/indexes/datetimes/methods/test_isocalendar.py new file mode 100644 index 0000000000000..128a8b3e10eb3 --- /dev/null +++ b/pandas/tests/indexes/datetimes/methods/test_isocalendar.py @@ -0,0 +1,20 @@ +from pandas import ( + DataFrame, + DatetimeIndex, +) +import pandas._testing as tm + + +def test_isocalendar_returns_correct_values_close_to_new_year_with_tz(): + # GH#6538: Check that DatetimeIndex and its TimeStamp elements + # return the same weekofyear accessor close to new year w/ tz + dates = ["2013/12/29", "2013/12/30", "2013/12/31"] + dates = DatetimeIndex(dates, tz="Europe/Brussels") + result = dates.isocalendar() + expected_data_frame = DataFrame( + [[2013, 52, 7], [2014, 1, 1], [2014, 1, 2]], + columns=["year", "week", "day"], + index=dates, + dtype="UInt32", + ) + tm.assert_frame_equal(result, expected_data_frame) diff --git a/pandas/tests/indexes/datetimes/test_asof.py b/pandas/tests/indexes/datetimes/test_asof.py index c794aefc6a48b..7adc400302cb9 100644 --- a/pandas/tests/indexes/datetimes/test_asof.py +++ b/pandas/tests/indexes/datetimes/test_asof.py @@ -1,8 +1,12 @@ +from datetime import timedelta + from pandas import ( Index, Timestamp, date_range, + isna, ) +import pandas._testing as tm class TestAsOf: @@ -12,3 +16,16 @@ def test_asof_partial(self): result = index.asof("2010-02") assert result == expected assert not isinstance(result, Index) + + def test_asof(self): + index = tm.makeDateIndex(100) + + dt = index[0] + assert index.asof(dt) == dt + assert isna(index.asof(dt - timedelta(1))) + + dt = index[-1] + assert index.asof(dt + timedelta(1)) == dt + + dt = index[0].to_pydatetime() + assert isinstance(index.asof(dt), Timestamp) diff --git a/pandas/tests/indexes/datetimes/test_freq_attr.py b/pandas/tests/indexes/datetimes/test_freq_attr.py new file mode 100644 index 0000000000000..f5821a316358d --- /dev/null +++ b/pandas/tests/indexes/datetimes/test_freq_attr.py @@ -0,0 +1,61 @@ +import pytest + +from pandas import ( + DatetimeIndex, + date_range, +) + +from pandas.tseries.offsets import ( + BDay, + DateOffset, + Day, + Hour, +) + + +class TestFreq: + def test_freq_setter_errors(self): + # GH#20678 + idx = DatetimeIndex(["20180101", "20180103", "20180105"]) + + # setting with an incompatible freq + msg = ( + "Inferred frequency 2D from passed values does not conform to " + "passed frequency 5D" + ) + with pytest.raises(ValueError, match=msg): + idx._data.freq = "5D" + + # setting with non-freq string + with pytest.raises(ValueError, match="Invalid frequency"): + idx._data.freq = "foo" + + @pytest.mark.parametrize("values", [["20180101", "20180103", "20180105"], []]) + @pytest.mark.parametrize("freq", ["2D", Day(2), "2B", BDay(2), "48H", Hour(48)]) + @pytest.mark.parametrize("tz", [None, "US/Eastern"]) + def test_freq_setter(self, values, freq, tz): + # GH#20678 + idx = DatetimeIndex(values, tz=tz) + + # can set to an offset, converting from string if necessary + idx._data.freq = freq + assert idx.freq == freq + assert isinstance(idx.freq, DateOffset) + + # can reset to None + idx._data.freq = None + assert idx.freq is None + + def test_freq_view_safe(self): + # Setting the freq for one DatetimeIndex shouldn't alter the freq + # for another that views the same data + + dti = date_range("2016-01-01", periods=5) + dta = dti._data + + dti2 = 
DatetimeIndex(dta)._with_freq(None) + assert dti2.freq is None + + # Original was not altered + assert dti.freq == "D" + assert dta.freq == "D" diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index f0757d0ba555e..44c353315562a 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -297,21 +297,6 @@ def test_week_and_weekofyear_are_deprecated(): idx.weekofyear -def test_isocalendar_returns_correct_values_close_to_new_year_with_tz(): - # GH 6538: Check that DatetimeIndex and its TimeStamp elements - # return the same weekofyear accessor close to new year w/ tz - dates = ["2013/12/29", "2013/12/30", "2013/12/31"] - dates = DatetimeIndex(dates, tz="Europe/Brussels") - result = dates.isocalendar() - expected_data_frame = pd.DataFrame( - [[2013, 52, 7], [2014, 1, 1], [2014, 1, 2]], - columns=["year", "week", "day"], - index=dates, - dtype="UInt32", - ) - tm.assert_frame_equal(result, expected_data_frame) - - def test_add_timedelta_preserves_freq(): # GH#37295 should hold for any DTI with freq=None or Tick freq tz = "Canada/Eastern" diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 7df94b5820e5d..d6ef4198fad2e 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -6,43 +6,17 @@ from pandas.compat import IS64 from pandas import ( - DateOffset, DatetimeIndex, Index, - Series, bdate_range, date_range, ) import pandas._testing as tm -from pandas.tseries.offsets import ( - BDay, - Day, - Hour, -) - START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) class TestDatetimeIndexOps: - def test_ops_properties_basic(self, datetime_series): - - # sanity check that the behavior didn't change - # GH#7206 - for op in ["year", "day", "second", "weekday"]: - msg = f"'Series' object has no attribute '{op}'" - with pytest.raises(AttributeError, match=msg): - getattr(datetime_series, op) - - # attribute access should still work! 
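# --- Editor's note (not part of this patch): the removed
# test_ops_properties_basic is not dropped; it resurfaces, split in two, as
# test_datetime_series_no_datelike_attrs and
# test_series_datetimelike_attribute_access in pandas/tests/series/test_api.py
# near the end of this same patch.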
- s = Series({"year": 2000, "month": 1, "day": 10}) - assert s.year == 2000 - assert s.month == 1 - assert s.day == 10 - msg = "'Series' object has no attribute 'weekday'" - with pytest.raises(AttributeError, match=msg): - s.weekday - @pytest.mark.parametrize( "freq,expected", [ @@ -74,72 +48,28 @@ def test_infer_freq(self, freq_sample): tm.assert_index_equal(idx, result) assert result.freq == freq_sample - @pytest.mark.parametrize("values", [["20180101", "20180103", "20180105"], []]) - @pytest.mark.parametrize("freq", ["2D", Day(2), "2B", BDay(2), "48H", Hour(48)]) - @pytest.mark.parametrize("tz", [None, "US/Eastern"]) - def test_freq_setter(self, values, freq, tz): - # GH 20678 - idx = DatetimeIndex(values, tz=tz) - - # can set to an offset, converting from string if necessary - idx._data.freq = freq - assert idx.freq == freq - assert isinstance(idx.freq, DateOffset) - - # can reset to None - idx._data.freq = None - assert idx.freq is None - - def test_freq_setter_errors(self): - # GH 20678 - idx = DatetimeIndex(["20180101", "20180103", "20180105"]) - - # setting with an incompatible freq - msg = ( - "Inferred frequency 2D from passed values does not conform to " - "passed frequency 5D" - ) - with pytest.raises(ValueError, match=msg): - idx._data.freq = "5D" - - # setting with non-freq string - with pytest.raises(ValueError, match="Invalid frequency"): - idx._data.freq = "foo" - - def test_freq_view_safe(self): - # Setting the freq for one DatetimeIndex shouldn't alter the freq - # for another that views the same data - - dti = date_range("2016-01-01", periods=5) - dta = dti._data - - dti2 = DatetimeIndex(dta)._with_freq(None) - assert dti2.freq is None - - # Original was not altered - assert dti.freq == "D" - assert dta.freq == "D" - +@pytest.mark.parametrize("freq", ["B", "C"]) class TestBusinessDatetimeIndex: - def setup_method(self, method): - self.rng = bdate_range(START, END) + @pytest.fixture + def rng(self, freq): + return bdate_range(START, END, freq=freq) - def test_comparison(self): - d = self.rng[10] + def test_comparison(self, rng): + d = rng[10] - comp = self.rng > d + comp = rng > d assert comp[11] assert not comp[9] - def test_copy(self): - cp = self.rng.copy() + def test_copy(self, rng): + cp = rng.copy() repr(cp) - tm.assert_index_equal(cp, self.rng) + tm.assert_index_equal(cp, rng) - def test_identical(self): - t1 = self.rng.copy() - t2 = self.rng.copy() + def test_identical(self, rng): + t1 = rng.copy() + t2 = rng.copy() assert t1.identical(t2) # name @@ -153,20 +83,3 @@ def test_identical(self): t2v = Index(t2.values) assert t1.equals(t2v) assert not t1.identical(t2v) - - -class TestCustomDatetimeIndex: - def setup_method(self, method): - self.rng = bdate_range(START, END, freq="C") - - def test_comparison(self): - d = self.rng[10] - - comp = self.rng > d - assert comp[11] - assert not comp[9] - - def test_copy(self): - cp = self.rng.copy() - repr(cp) - tm.assert_index_equal(cp, self.rng) diff --git a/pandas/tests/indexes/period/test_freq_attr.py b/pandas/tests/indexes/period/test_freq_attr.py new file mode 100644 index 0000000000000..3bf3e700e5e72 --- /dev/null +++ b/pandas/tests/indexes/period/test_freq_attr.py @@ -0,0 +1,21 @@ +import pytest + +from pandas import ( + offsets, + period_range, +) +import pandas._testing as tm + + +class TestFreq: + def test_freq_setter_deprecated(self): + # GH#20678 + idx = period_range("2018Q1", periods=4, freq="Q") + + # no warning for getter + with tm.assert_produces_warning(None): + idx.freq + + # warning for setter + with 
pytest.raises(AttributeError, match="can't set attribute"): + idx.freq = offsets.Day() diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index a7dad4e7f352c..f07107e9d3277 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -38,12 +38,6 @@ def index(self, request): def test_pickle_compat_construction(self): super().test_pickle_compat_construction() - @pytest.mark.parametrize("freq", ["D", "M", "A"]) - def test_pickle_round_trip(self, freq): - idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.NaN], freq=freq) - result = tm.round_trip_pickle(idx) - tm.assert_index_equal(result, idx) - def test_where(self): # This is handled in test_indexing pass @@ -307,13 +301,6 @@ def test_with_multi_index(self): assert isinstance(s.index.values[0][0], Period) - def test_pickle_freq(self): - # GH2891 - prng = period_range("1/1/2011", "1/1/2012", freq="M") - new_prng = tm.round_trip_pickle(prng) - assert new_prng.freq == offsets.MonthEnd() - assert new_prng.freqstr == "M" - def test_map(self): # test_map_dictlike generally tests @@ -341,47 +328,6 @@ def test_maybe_convert_timedelta(): pi._maybe_convert_timedelta(offset) -def test_is_monotonic_with_nat(): - # GH#31437 - # PeriodIndex.is_monotonic should behave analogously to DatetimeIndex, - # in particular never be monotonic when we have NaT - dti = date_range("2016-01-01", periods=3) - pi = dti.to_period("D") - tdi = Index(dti.view("timedelta64[ns]")) - - for obj in [pi, pi._engine, dti, dti._engine, tdi, tdi._engine]: - if isinstance(obj, Index): - # i.e. not Engines - assert obj.is_monotonic - assert obj.is_monotonic_increasing - assert not obj.is_monotonic_decreasing - assert obj.is_unique - - dti1 = dti.insert(0, NaT) - pi1 = dti1.to_period("D") - tdi1 = Index(dti1.view("timedelta64[ns]")) - - for obj in [pi1, pi1._engine, dti1, dti1._engine, tdi1, tdi1._engine]: - if isinstance(obj, Index): - # i.e. not Engines - assert not obj.is_monotonic - assert not obj.is_monotonic_increasing - assert not obj.is_monotonic_decreasing - assert obj.is_unique - - dti2 = dti.insert(3, NaT) - pi2 = dti2.to_period("H") - tdi2 = Index(dti2.view("timedelta64[ns]")) - - for obj in [pi2, pi2._engine, dti2, dti2._engine, tdi2, tdi2._engine]: - if isinstance(obj, Index): - # i.e. 
not Engines - assert not obj.is_monotonic - assert not obj.is_monotonic_increasing - assert not obj.is_monotonic_decreasing - assert obj.is_unique - - @pytest.mark.parametrize("array", [True, False]) def test_dunder_array(array): obj = PeriodIndex(["2000-01-01", "2001-01-01"], freq="D") diff --git a/pandas/tests/indexes/period/test_pickle.py b/pandas/tests/indexes/period/test_pickle.py new file mode 100644 index 0000000000000..82f906d1e361f --- /dev/null +++ b/pandas/tests/indexes/period/test_pickle.py @@ -0,0 +1,26 @@ +import numpy as np +import pytest + +from pandas import ( + NaT, + PeriodIndex, + period_range, +) +import pandas._testing as tm + +from pandas.tseries import offsets + + +class TestPickle: + @pytest.mark.parametrize("freq", ["D", "M", "A"]) + def test_pickle_round_trip(self, freq): + idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.NaN], freq=freq) + result = tm.round_trip_pickle(idx) + tm.assert_index_equal(result, idx) + + def test_pickle_freq(self): + # GH#2891 + prng = period_range("1/1/2011", "1/1/2012", freq="M") + new_prng = tm.round_trip_pickle(prng) + assert new_prng.freq == offsets.MonthEnd() + assert new_prng.freqstr == "M" diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_resolution.py similarity index 56% rename from pandas/tests/indexes/period/test_ops.py rename to pandas/tests/indexes/period/test_resolution.py index 9ebe44fb16c8d..7ecbde75cfa47 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_resolution.py @@ -1,10 +1,9 @@ import pytest import pandas as pd -import pandas._testing as tm -class TestPeriodIndexOps: +class TestResolution: @pytest.mark.parametrize( "freq,expected", [ @@ -22,15 +21,3 @@ class TestPeriodIndexOps: def test_resolution(self, freq, expected): idx = pd.period_range(start="2013-04-01", periods=30, freq=freq) assert idx.resolution == expected - - def test_freq_setter_deprecated(self): - # GH 20678 - idx = pd.period_range("2018Q1", periods=4, freq="Q") - - # no warning for getter - with tm.assert_produces_warning(None): - idx.freq - - # warning for setter - with pytest.raises(AttributeError, match="can't set attribute"): - idx.freq = pd.offsets.Day() diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py index 39a1ddcbc8a6a..f7dafd78a801f 100644 --- a/pandas/tests/indexes/test_any_index.py +++ b/pandas/tests/indexes/test_any_index.py @@ -84,6 +84,13 @@ def test_is_type_compatible_deprecation(index): index.is_type_compatible(index.inferred_type) +def test_is_mixed_deprecated(index): + # GH#32922 + msg = "Index.is_mixed is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + index.is_mixed() + + class TestConversion: def test_to_series(self, index): # assert that we are creating a copy of the index diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 50be69fb93d7c..7f9a5c0b50595 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -29,7 +29,6 @@ TimedeltaIndex, Timestamp, date_range, - isna, period_range, ) import pandas._testing as tm @@ -395,15 +394,6 @@ def test_constructor_empty_special(self, empty, klass): assert isinstance(empty, klass) assert not len(empty) - def test_constructor_overflow_int64(self): - # see gh-15832 - msg = ( - "The elements provided in the data cannot " - "all be casted to the dtype int64" - ) - with pytest.raises(OverflowError, match=msg): - Index([np.iinfo(np.uint64).max - 1], dtype="int64") - 
@pytest.mark.parametrize( "index", [ @@ -502,18 +492,6 @@ def test_is_(self): ind2 = Index(arr, copy=False) assert not ind1.is_(ind2) - @pytest.mark.parametrize("index", ["datetime"], indirect=True) - def test_asof(self, index): - d = index[0] - assert index.asof(d) == d - assert isna(index.asof(d - timedelta(1))) - - d = index[-1] - assert index.asof(d + timedelta(1)) == d - - d = index[0].to_pydatetime() - assert isinstance(index.asof(d), Timestamp) - def test_asof_numeric_vs_bool_raises(self): left = Index([1, 2, 3]) right = Index([True, False]) @@ -699,12 +677,6 @@ def test_append_empty_preserve_name(self, name, expected): result = left.append(right) assert result.name == expected - def test_is_mixed_deprecated(self, simple_index): - # GH#32922 - index = simple_index - with tm.assert_produces_warning(FutureWarning): - index.is_mixed() - @pytest.mark.parametrize( "index, expected", [ diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index 293aa6dd57124..5c5ec7219d2d7 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -272,3 +272,14 @@ def __array__(self, dtype=None) -> np.ndarray: expected = Index(array) result = Index(ArrayLike(array)) tm.assert_index_equal(result, expected) + + +class TestIndexConstructionErrors: + def test_constructor_overflow_int64(self): + # see GH#15832 + msg = ( + "The elements provided in the data cannot " + "all be casted to the dtype int64" + ) + with pytest.raises(OverflowError, match=msg): + Index([np.iinfo(np.uint64).max - 1], dtype="int64") diff --git a/pandas/tests/indexes/timedeltas/test_freq_attr.py b/pandas/tests/indexes/timedeltas/test_freq_attr.py new file mode 100644 index 0000000000000..39b9c11aa833c --- /dev/null +++ b/pandas/tests/indexes/timedeltas/test_freq_attr.py @@ -0,0 +1,61 @@ +import pytest + +from pandas import TimedeltaIndex + +from pandas.tseries.offsets import ( + DateOffset, + Day, + Hour, +) + + +class TestFreq: + @pytest.mark.parametrize("values", [["0 days", "2 days", "4 days"], []]) + @pytest.mark.parametrize("freq", ["2D", Day(2), "48H", Hour(48)]) + def test_freq_setter(self, values, freq): + # GH#20678 + idx = TimedeltaIndex(values) + + # can set to an offset, converting from string if necessary + idx._data.freq = freq + assert idx.freq == freq + assert isinstance(idx.freq, DateOffset) + + # can reset to None + idx._data.freq = None + assert idx.freq is None + + def test_freq_setter_errors(self): + # GH#20678 + idx = TimedeltaIndex(["0 days", "2 days", "4 days"]) + + # setting with an incompatible freq + msg = ( + "Inferred frequency 2D from passed values does not conform to " + "passed frequency 5D" + ) + with pytest.raises(ValueError, match=msg): + idx._data.freq = "5D" + + # setting with a non-fixed frequency + msg = r"<2 \* BusinessDays> is a non-fixed frequency" + with pytest.raises(ValueError, match=msg): + idx._data.freq = "2B" + + # setting with non-freq string + with pytest.raises(ValueError, match="Invalid frequency"): + idx._data.freq = "foo" + + def test_freq_view_safe(self): + # Setting the freq for one TimedeltaIndex shouldn't alter the freq + # for another that views the same data + + tdi = TimedeltaIndex(["0 days", "2 days", "4 days"], freq="2D") + tda = tdi._data + + tdi2 = TimedeltaIndex(tda)._with_freq(None) + assert tdi2.freq is None + + # Original was not altered + assert tdi.freq == "2D" + assert tda.freq == "2D" diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py 
b/pandas/tests/indexes/timedeltas/test_indexing.py index fc8abb83ed302..66fdaa2778600 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -340,3 +340,17 @@ def test_slice_invalid_str_with_timedeltaindex( indexer_sl(obj)[:"foo"] with pytest.raises(TypeError, match=msg): indexer_sl(obj)[tdi[0] : "foo"] + + +class TestContains: + def test_contains_nonunique(self): + # GH#9512 + for vals in ( + [0, 1, 0], + [0, 0, -1], + [0, -1, -1], + ["00:01:00", "00:01:00", "00:02:00"], + ["00:01:00", "00:01:00", "00:00:01"], + ): + idx = TimedeltaIndex(vals) + assert idx[0] in idx diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index f5d601bcfbcd1..f6013baf86edc 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -1,86 +1,14 @@ -import pytest - from pandas import ( TimedeltaIndex, timedelta_range, ) import pandas._testing as tm -from pandas.tseries.offsets import ( - DateOffset, - Day, - Hour, -) - class TestTimedeltaIndexOps: - def test_nonunique_contains(self): - # GH 9512 - for idx in map( - TimedeltaIndex, - ( - [0, 1, 0], - [0, 0, -1], - [0, -1, -1], - ["00:01:00", "00:01:00", "00:02:00"], - ["00:01:00", "00:01:00", "00:00:01"], - ), - ): - assert idx[0] in idx - def test_infer_freq(self, freq_sample): # GH#11018 idx = timedelta_range("1", freq=freq_sample, periods=10) result = TimedeltaIndex(idx.asi8, freq="infer") tm.assert_index_equal(idx, result) assert result.freq == freq_sample - - @pytest.mark.parametrize("values", [["0 days", "2 days", "4 days"], []]) - @pytest.mark.parametrize("freq", ["2D", Day(2), "48H", Hour(48)]) - def test_freq_setter(self, values, freq): - # GH 20678 - idx = TimedeltaIndex(values) - - # can set to an offset, converting from string if necessary - idx._data.freq = freq - assert idx.freq == freq - assert isinstance(idx.freq, DateOffset) - - # can reset to None - idx._data.freq = None - assert idx.freq is None - - def test_freq_setter_errors(self): - # GH 20678 - idx = TimedeltaIndex(["0 days", "2 days", "4 days"]) - - # setting with an incompatible freq - msg = ( - "Inferred frequency 2D from passed values does not conform to " - "passed frequency 5D" - ) - with pytest.raises(ValueError, match=msg): - idx._data.freq = "5D" - - # setting with a non-fixed frequency - msg = r"<2 \* BusinessDays> is a non-fixed frequency" - with pytest.raises(ValueError, match=msg): - idx._data.freq = "2B" - - # setting with non-freq string - with pytest.raises(ValueError, match="Invalid frequency"): - idx._data.freq = "foo" - - def test_freq_view_safe(self): - # Setting the freq for one TimedeltaIndex shouldn't alter the freq - # for another that views the same data - - tdi = TimedeltaIndex(["0 days", "2 days", "4 days"], freq="2D") - tda = tdi._data - - tdi2 = TimedeltaIndex(tda)._with_freq(None) - assert tdi2.freq is None - - # Original was not altered - assert tdi.freq == "2D" - assert tda.freq == "2D" diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index aaf98e46f2f09..4e4eb89328540 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -191,3 +191,20 @@ def test_unknown_attribute(self): msg = "'Series' object has no attribute 'foo'" with pytest.raises(AttributeError, match=msg): ser.foo + + def test_datetime_series_no_datelike_attrs(self, datetime_series): + # GH#7206 + for op in ["year", "day", "second", "weekday"]: + msg = f"'Series' object has no 
attribute '{op}'" + with pytest.raises(AttributeError, match=msg): + getattr(datetime_series, op) + + def test_series_datetimelike_attribute_access(self): + # attribute access should still work! + ser = Series({"year": 2000, "month": 1, "day": 10}) + assert ser.year == 2000 + assert ser.month == 1 + assert ser.day == 10 + msg = "'Series' object has no attribute 'weekday'" + with pytest.raises(AttributeError, match=msg): + ser.weekday From eca9f6cf17683a6720752fb9d246534c5d1196ce Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 12 Nov 2021 04:09:35 +0100 Subject: [PATCH 23/53] Fixed regression in Series.duplicated for categorical dtype with bool categories (#44356) --- doc/source/whatsnew/v1.3.5.rst | 1 + pandas/core/algorithms.py | 2 +- .../series/methods/test_drop_duplicates.py | 15 +++++++++++++++ pandas/tests/series/methods/test_duplicated.py | 18 +++++++++++++++++- 4 files changed, 34 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.5.rst b/doc/source/whatsnew/v1.3.5.rst index 589092c0dd7e3..951b05b65c81b 100644 --- a/doc/source/whatsnew/v1.3.5.rst +++ b/doc/source/whatsnew/v1.3.5.rst @@ -16,6 +16,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`Series.equals` when comparing floats with dtype object to None (:issue:`44190`) - Fixed performance regression in :func:`read_csv` (:issue:`44106`) +- Fixed regression in :meth:`Series.duplicated` and :meth:`Series.drop_duplicates` when Series has :class:`Categorical` dtype with boolean categories (:issue:`44351`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c1b587ce3a6b2..8c2c01b6aedc8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -148,7 +148,7 @@ def _ensure_data(values: ArrayLike) -> np.ndarray: # i.e. 
all-bool Categorical, BooleanArray try: return np.asarray(values).astype("uint8", copy=False) - except TypeError: + except (TypeError, ValueError): # GH#42107 we have pd.NAs present return np.asarray(values) diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py index 7eb51f8037792..f72d85337df8e 100644 --- a/pandas/tests/series/methods/test_drop_duplicates.py +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -2,6 +2,7 @@ import pytest from pandas import ( + NA, Categorical, Series, ) @@ -224,6 +225,20 @@ def test_drop_duplicates_categorical_bool(self, ordered): assert return_value is None tm.assert_series_equal(sc, tc[~expected]) + def test_drop_duplicates_categorical_bool_na(self): + # GH#44351 + ser = Series( + Categorical( + [True, False, True, False, NA], categories=[True, False], ordered=True + ) + ) + result = ser.drop_duplicates() + expected = Series( + Categorical([True, False, np.nan], categories=[True, False], ordered=True), + index=[0, 1, 4], + ) + tm.assert_series_equal(result, expected) + def test_drop_duplicates_pos_args_deprecation(): # GH#41485 diff --git a/pandas/tests/series/methods/test_duplicated.py b/pandas/tests/series/methods/test_duplicated.py index 5cc297913e851..c61492168da63 100644 --- a/pandas/tests/series/methods/test_duplicated.py +++ b/pandas/tests/series/methods/test_duplicated.py @@ -1,7 +1,11 @@ import numpy as np import pytest -from pandas import Series +from pandas import ( + NA, + Categorical, + Series, +) import pandas._testing as tm @@ -33,3 +37,15 @@ def test_duplicated_nan_none(keep, expected): result = ser.duplicated(keep=keep) tm.assert_series_equal(result, expected) + + +def test_duplicated_categorical_bool_na(): + # GH#44351 + ser = Series( + Categorical( + [True, False, True, False, NA], categories=[True, False], ordered=True + ) + ) + result = ser.duplicated() + expected = Series([False, False, True, True, False]) + tm.assert_series_equal(result, expected) From dc6154764c23fddca715de7d8c33e410078b44bc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 12 Nov 2021 04:10:37 +0100 Subject: [PATCH 24/53] CLN: Refactor extract multiindex header call (#44399) --- pandas/io/parsers/base_parser.py | 4 ++-- pandas/io/parsers/c_parser_wrapper.py | 31 +++++++++++---------------- pandas/io/parsers/python_parser.py | 28 +++++++++--------------- 3 files changed, 24 insertions(+), 39 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 8cdcc05f60266..339585810bec1 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -314,14 +314,14 @@ def _should_parse_dates(self, i: int) -> bool: @final def _extract_multi_indexer_columns( - self, header, index_names, col_names, passed_names: bool = False + self, header, index_names, passed_names: bool = False ): """ extract and return the names, index_names, col_names header is a list-of-lists returned from the parsers """ if len(header) < 2: - return header[0], index_names, col_names, passed_names + return header[0], index_names, None, passed_names # the names are the tuples of the header that are not the index cols # 0 is the name of the index, assuming index_col is a list of column diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 32ca3aaeba6cc..352dd998dda0f 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -78,25 +78,18 @@ 
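[illustrative note — not part of the patch] The point of the refactor in the
two parser hunks that follow: the single-header guard both parsers used to
duplicate now lives inside the helper itself (see the base_parser.py hunk
above), so callers can invoke it unconditionally. Conceptually:

    def _extract_multi_indexer_columns(self, header, index_names, passed_names=False):
        if len(header) < 2:
            # single header row: nothing to extract; col_names is now always None
            return header[0], index_names, None, passed_names
        # multi-row case builds the tuple column names, as before
        ...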
def __init__(self, src: FilePathOrBuffer, **kwds): if self._reader.header is None: self.names = None else: - if len(self._reader.header) > 1: - # we have a multi index in the columns - # error: Cannot determine type of 'names' - # error: Cannot determine type of 'index_names' - # error: Cannot determine type of 'col_names' - ( - self.names, # type: ignore[has-type] - self.index_names, - self.col_names, - passed_names, - ) = self._extract_multi_indexer_columns( - self._reader.header, - self.index_names, # type: ignore[has-type] - self.col_names, # type: ignore[has-type] - passed_names, - ) - else: - # error: Cannot determine type of 'names' - self.names = list(self._reader.header[0]) # type: ignore[has-type] + # error: Cannot determine type of 'names' + # error: Cannot determine type of 'index_names' + ( + self.names, # type: ignore[has-type] + self.index_names, + self.col_names, + passed_names, + ) = self._extract_multi_indexer_columns( + self._reader.header, + self.index_names, # type: ignore[has-type] + passed_names, + ) # error: Cannot determine type of 'names' if self.names is None: # type: ignore[has-type] diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index af253fc062632..b0e868b260369 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -117,24 +117,16 @@ def __init__(self, f: FilePathOrBuffer | list, **kwds): # Now self.columns has the set of columns that we will process. # The original set is stored in self.original_columns. - if len(self.columns) > 1: - # we are processing a multi index column - # error: Cannot determine type of 'index_names' - # error: Cannot determine type of 'col_names' - ( - self.columns, - self.index_names, - self.col_names, - _, - ) = self._extract_multi_indexer_columns( - self.columns, - self.index_names, # type: ignore[has-type] - self.col_names, # type: ignore[has-type] - ) - # Update list of original names to include all indices. 
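[illustrative note — not part of the patch] For context, the branch being
removed here backs the multi-row-header path of the public API; a small,
self-contained example of the behaviour involved:

    import io

    import pandas as pd

    data = "a,a,b\nx,y,z\n1,2,3\n"
    df = pd.read_csv(io.StringIO(data), header=[0, 1])
    list(df.columns)  # [('a', 'x'), ('a', 'y'), ('b', 'z')]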
- self.num_original_columns = len(self.columns) - else: - self.columns = self.columns[0] + # error: Cannot determine type of 'index_names' + ( + self.columns, + self.index_names, + self.col_names, + _, + ) = self._extract_multi_indexer_columns( + self.columns, + self.index_names, # type: ignore[has-type] + ) # get popped off for index self.orig_names: list[int | str | tuple] = list(self.columns) From 66147c9b41371eb812d8dabd55c44aa1cbc69d91 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 11 Nov 2021 19:11:32 -0800 Subject: [PATCH 25/53] BUG: DataFrame.stack with EA columns (#44401) --- doc/source/whatsnew/v1.4.0.rst | 2 ++ pandas/core/reshape/reshape.py | 4 +++- pandas/tests/frame/test_stack_unstack.py | 24 ++++++++++++++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 8732e1c397ce5..d1e209adb1b8f 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -623,6 +623,8 @@ Reshaping - Bug in :func:`crosstab` would fail when inputs are lists or tuples (:issue:`44076`) - Bug in :meth:`DataFrame.append` failing to retain ``index.name`` when appending a list of :class:`Series` objects (:issue:`44109`) - Fixed metadata propagation in :meth:`Dataframe.apply` method, consequently fixing the same issue for :meth:`Dataframe.transform`, :meth:`Dataframe.nunique` and :meth:`Dataframe.mode` (:issue:`28283`) +- Bug in :meth:`DataFrame.stack` with ``ExtensionDtype`` columns incorrectly raising (:issue:`43561`) +- Sparse ^^^^^^ diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 9c7107ab40644..6c6b14653df75 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -745,13 +745,15 @@ def _convert_level_number(level_num, columns): if frame._is_homogeneous_type and is_extension_array_dtype( frame.dtypes.iloc[0] ): + # TODO(EA2D): won't need special case, can go through .values + # paths below (might change to ._values) dtype = this[this.columns[loc]].dtypes.iloc[0] subset = this[this.columns[loc]] value_slice = dtype.construct_array_type()._concat_same_type( [x._values for _, x in subset.items()] ) - N, K = this.shape + N, K = subset.shape idx = np.arange(N * K).reshape(K, N).T.ravel() value_slice = value_slice.take(idx) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 404baecdfecac..62512249dabfc 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -2099,3 +2099,27 @@ def test_stack_unsorted(self): result = DF.stack(["VAR", "TYP"]).sort_index() expected = DF.sort_index(axis=1).stack(["VAR", "TYP"]).sort_index() tm.assert_series_equal(result, expected) + + def test_stack_nullable_dtype(self): + # GH#43561 + columns = MultiIndex.from_product( + [["54511", "54515"], ["r", "t_mean"]], names=["station", "element"] + ) + index = Index([1, 2, 3], name="time") + + arr = np.array([[50, 226, 10, 215], [10, 215, 9, 220], [305, 232, 111, 220]]) + df = DataFrame(arr, columns=columns, index=index, dtype=pd.Int64Dtype()) + + result = df.stack("station") + + expected = df.astype(np.int64).stack("station").astype(pd.Int64Dtype()) + tm.assert_frame_equal(result, expected) + + # non-homogeneous case + df[df.columns[0]] = df[df.columns[0]].astype(pd.Float64Dtype()) + result = df.stack("station") + + # TODO(EA2D): we get object dtype because DataFrame.values can't + # be an EA + expected = df.astype(object).stack("station") + 
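[illustrative note — not part of the patch] The test in this hunk encodes
the regression directly: stacking a frame whose columns hold a nullable
ExtensionDtype raised before this fix. Roughly, in user terms:

    import numpy as np

    import pandas as pd

    columns = pd.MultiIndex.from_product(
        [["54511", "54515"], ["r", "t_mean"]], names=["station", "element"]
    )
    df = pd.DataFrame(
        np.arange(12).reshape(3, 4), columns=columns, dtype=pd.Int64Dtype()
    )
    df.stack("station")  # raised before GH#43561 was fixed; now keeps Int64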
tm.assert_frame_equal(result, expected) From a0a76a13540b34221899b28db8308ac55568b3be Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Thu, 11 Nov 2021 22:19:05 -0500 Subject: [PATCH 26/53] ENH: Use find_stack_level in pandas.core (#44358) --- pandas/core/accessor.py | 3 +- pandas/core/arraylike.py | 3 +- pandas/core/arrays/categorical.py | 20 +++++++------- pandas/core/arrays/datetimes.py | 4 +-- pandas/core/arrays/sparse/array.py | 4 +-- pandas/core/arrays/sparse/dtype.py | 3 +- pandas/core/common.py | 3 +- pandas/core/computation/align.py | 5 +++- pandas/core/computation/eval.py | 3 +- pandas/core/config_init.py | 4 ++- pandas/core/construction.py | 5 ++-- pandas/core/describe.py | 3 +- pandas/core/dtypes/cast.py | 18 ++++++------ pandas/core/dtypes/common.py | 5 ++-- pandas/core/frame.py | 25 +++++++++-------- pandas/core/generic.py | 28 +++++++++---------- pandas/core/groupby/generic.py | 3 +- pandas/core/index.py | 4 ++- pandas/core/indexers/utils.py | 2 +- pandas/core/indexes/accessors.py | 4 ++- pandas/core/indexes/base.py | 40 +++++++++++++-------------- pandas/core/indexes/category.py | 7 +++-- pandas/core/indexes/datetimelike.py | 3 +- pandas/core/indexes/datetimes.py | 8 +++--- pandas/core/indexes/multi.py | 21 +++++++------- pandas/core/indexes/numeric.py | 3 +- pandas/core/indexes/period.py | 3 +- pandas/core/indexes/range.py | 9 +++--- pandas/core/indexing.py | 5 ++-- pandas/core/internals/__init__.py | 4 ++- pandas/core/internals/blocks.py | 2 +- pandas/core/internals/construction.py | 3 +- pandas/core/internals/managers.py | 4 +-- pandas/core/ops/__init__.py | 3 +- pandas/core/reshape/melt.py | 3 +- pandas/core/reshape/merge.py | 7 +++-- pandas/core/series.py | 13 +++++---- pandas/core/strings/accessor.py | 5 ++-- pandas/core/tools/datetimes.py | 3 +- pandas/core/window/ewm.py | 2 +- pandas/core/window/rolling.py | 7 +++-- 41 files changed, 169 insertions(+), 135 deletions(-) diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index c31368f179ef0..07fa5799fe371 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -9,6 +9,7 @@ import warnings from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level class DirNamesMixin: @@ -267,7 +268,7 @@ def decorator(accessor): f"{repr(name)} for type {repr(cls)} is overriding a preexisting " f"attribute with the same name.", UserWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) setattr(cls, name, CachedAccessor(name, accessor)) cls._accessors.add(name) diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index fe09a044566f8..11d32e8a159f3 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -11,6 +11,7 @@ import numpy as np from pandas._libs import lib +from pandas.util._exceptions import find_stack_level from pandas.core.construction import extract_array from pandas.core.ops import ( @@ -210,7 +211,7 @@ def _maybe_fallback(ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any): "or align manually (eg 'df1, df2 = df1.align(df2)') before passing to " "the ufunc to obtain the future behaviour and silence this warning.", FutureWarning, - stacklevel=4, + stacklevel=find_stack_level(), ) # keep the first dataframe of the inputs, other DataFrame/Series is diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 145ff60a28f46..f205773d1b03d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -390,7 +390,7 @@ def 
__init__( "Allowing scalars in the Categorical constructor is deprecated " "and will raise in a future version. Use `[value]` instead", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) values = [values] @@ -940,7 +940,7 @@ def set_categories( "a future version. Removing unused categories will always " "return a new Categorical object.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) else: inplace = False @@ -1040,7 +1040,7 @@ def rename_categories(self, new_categories, inplace=no_default): "a future version. Removing unused categories will always " "return a new Categorical object.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) else: inplace = False @@ -1172,7 +1172,7 @@ def add_categories(self, new_categories, inplace=no_default): "a future version. Removing unused categories will always " "return a new Categorical object.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) else: inplace = False @@ -1247,7 +1247,7 @@ def remove_categories(self, removals, inplace=no_default): "a future version. Removing unused categories will always " "return a new Categorical object.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) else: inplace = False @@ -1322,7 +1322,7 @@ def remove_unused_categories(self, inplace=no_default): "remove_unused_categories is deprecated and " "will be removed in a future version.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) else: inplace = False @@ -1879,7 +1879,7 @@ def to_dense(self) -> np.ndarray: "Categorical.to_dense is deprecated and will be removed in " "a future version. Use np.asarray(cat) instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return np.asarray(self) @@ -1896,7 +1896,7 @@ def _codes(self, value: np.ndarray): "Setting the codes on a Categorical is deprecated and will raise in " "a future version. 
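[illustrative note — not part of the patch] The mechanical change repeated
throughout this patch swaps hard-coded stacklevel integers, which silently
go stale whenever an internal call chain changes depth, for a helper that
computes the level at runtime. A simplified sketch of such a helper; the
real one lives in pandas.util._exceptions, and the details below are
illustrative only:

    import inspect
    import os

    def find_stack_level() -> int:
        """Count frames up to the first one outside pandas, so that
        warnings.warn(..., stacklevel=find_stack_level()) points at
        user code rather than at pandas internals."""
        import pandas as pd

        pkg_dir = os.path.dirname(pd.__file__)
        level = 1
        frame = inspect.currentframe().f_back
        while frame is not None and inspect.getfile(frame).startswith(pkg_dir):
            level += 1
            frame = frame.f_back
        return level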
Create a new Categorical object instead", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) # GH#40606 NDArrayBacked.__init__(self, value, self.dtype) @@ -1919,7 +1919,7 @@ def take_nd(self, indexer, allow_fill: bool = False, fill_value=None): warn( "Categorical.take_nd is deprecated, use Categorical.take instead", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return self.take(indexer, allow_fill=allow_fill, fill_value=fill_value) @@ -2339,7 +2339,7 @@ def is_dtype_equal(self, other) -> bool: "Categorical.is_dtype_equal is deprecated and will be removed " "in a future version", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) try: return self._categories_match_up_to_permutation(other) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 4fecbe4be9681..a0a7ef3501d7f 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1206,7 +1206,7 @@ def to_perioddelta(self, freq) -> TimedeltaArray: "Use `dtindex - dtindex.to_period(freq).to_timestamp()` instead.", FutureWarning, # stacklevel chosen to be correct for when called from DatetimeIndex - stacklevel=3, + stacklevel=find_stack_level(), ) from pandas.core.arrays.timedeltas import TimedeltaArray @@ -1373,7 +1373,7 @@ def weekofyear(self): "weekofyear and return an Index, you may call " "pd.Int64Index(idx.isocalendar().week)", FutureWarning, - stacklevel=3, + stacklevel=find_stack_level(), ) week_series = self.isocalendar().week if week_series.hasnans: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 960544a2f89ea..c054710a01f75 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -467,7 +467,7 @@ def __init__( "loses timezone information. Cast to object before " "sparse to retain timezone information.", UserWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) data = np.asarray(data, dtype="datetime64[ns]") if fill_value is NaT: @@ -1089,7 +1089,7 @@ def searchsorted( ) -> npt.NDArray[np.intp] | np.intp: msg = "searchsorted requires high memory usage." - warnings.warn(msg, PerformanceWarning, stacklevel=2) + warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level()) if not is_scalar(v): v = np.asarray(v) v = np.asarray(v) diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index 915e13bc3bbb2..d23e217e605c7 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -16,6 +16,7 @@ type_t, ) from pandas.errors import PerformanceWarning +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import ( ExtensionDtype, @@ -389,7 +390,7 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: f"values: '{fill_values}'. Picking the first and " "converting the rest.", PerformanceWarning, - stacklevel=6, + stacklevel=find_stack_level(), ) np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes] diff --git a/pandas/core/common.py b/pandas/core/common.py index 2bf925466e176..590296c4b12f5 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -36,6 +36,7 @@ Scalar, T, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( @@ -175,7 +176,7 @@ def cast_scalar_indexer(val, warn_float: bool = False): "Indexing with a float is deprecated, and will raise an IndexError " "in pandas 2.0. 
You can manually convert to an integer key instead.", FutureWarning, - stacklevel=3, + stacklevel=find_stack_level(), ) return int(val) return val diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index a4bd0270f9451..f14882227ddd9 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -16,6 +16,7 @@ import numpy as np from pandas.errors import PerformanceWarning +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -126,7 +127,9 @@ def _align_core(terms): f"than an order of magnitude on term {repr(terms[i].name)}, " f"by more than {ordm:.4g}; performance may suffer." ) - warnings.warn(w, category=PerformanceWarning, stacklevel=6) + warnings.warn( + w, category=PerformanceWarning, stacklevel=find_stack_level() + ) f = partial(ti.reindex, reindexer, axis=axis, copy=False) diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 26748eadb4c85..d82cc37b90ad4 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -7,6 +7,7 @@ import warnings from pandas._libs.lib import no_default +from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg from pandas.core.computation.engines import ENGINES @@ -308,7 +309,7 @@ def eval( "will be removed in a future version." ), FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) exprs: list[str | BinOp] diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 0081f8cd074b6..31c2ec8f0cbf9 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -25,6 +25,8 @@ is_text, ) +from pandas.util._exceptions import find_stack_level + # compute use_bottleneck_doc = """ @@ -373,7 +375,7 @@ def _deprecate_negative_int_max_colwidth(key): "will not be supported in future version. Instead, use None " "to not limit the column width.", FutureWarning, - stacklevel=4, + stacklevel=find_stack_level(), ) cf.register_option( diff --git a/pandas/core/construction.py b/pandas/core/construction.py index c6f131a9daba6..e3b41f2c7b8c2 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -25,6 +25,7 @@ DtypeObj, ) from pandas.errors import IntCastingNaNError +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import ( ExtensionDtype, @@ -538,7 +539,7 @@ def sanitize_array( "if they cannot be cast losslessly (matching Series behavior). " "To retain the old behavior, use DataFrame(data).astype(dtype)", FutureWarning, - stacklevel=4, + stacklevel=find_stack_level(), ) # GH#40110 until the deprecation is enforced, we _dont_ # ignore the dtype for DataFrame, and _do_ cast even though @@ -777,7 +778,7 @@ def _try_cast( "passed to 'DataFrame', either all columns will be cast to that " "dtype, or a TypeError will be raised.", FutureWarning, - stacklevel=7, + stacklevel=find_stack_level(), ) subarr = np.array(arr, dtype=object, copy=copy) return subarr diff --git a/pandas/core/describe.py b/pandas/core/describe.py index 2c4a340e8c8ea..8d88ce280d5c8 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -23,6 +23,7 @@ from pandas._libs.tslibs import Timestamp from pandas._typing import NDFrameT +from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_percentile from pandas.core.dtypes.common import ( @@ -377,7 +378,7 @@ def select_describe_func( "version of pandas. 
Specify `datetime_is_numeric=True` to " "silence this warning and adopt the future behavior now.", FutureWarning, - stacklevel=5, + stacklevel=find_stack_level(), ) return describe_timestamp_as_categorical_1d elif is_timedelta64_dtype(data.dtype): diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 432074a8dd699..2c26d6f838315 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -563,7 +563,7 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan): "dtype is deprecated. In a future version, this will be cast " "to object dtype. Pass `fill_value=Timestamp(date_obj)` instead.", FutureWarning, - stacklevel=8, + stacklevel=find_stack_level(), ) return dtype, fv elif isinstance(fill_value, str): @@ -1133,7 +1133,7 @@ def astype_nansafe( "Use .view(...) instead.", FutureWarning, # stacklevel chosen to be correct when reached via Series.astype - stacklevel=7, + stacklevel=find_stack_level(), ) if isna(arr).any(): raise ValueError("Cannot convert NaT values to integer") @@ -1155,7 +1155,7 @@ def astype_nansafe( "Use .view(...) instead.", FutureWarning, # stacklevel chosen to be correct when reached via Series.astype - stacklevel=7, + stacklevel=find_stack_level(), ) if isna(arr).any(): raise ValueError("Cannot convert NaT values to integer") @@ -1651,7 +1651,7 @@ def maybe_cast_to_datetime( "`pd.Series(values).dt.tz_localize(None)` " "instead.", FutureWarning, - stacklevel=8, + stacklevel=find_stack_level(), ) # equiv: dta.view(dtype) # Note: NOT equivalent to dta.astype(dtype) @@ -1691,7 +1691,7 @@ def maybe_cast_to_datetime( ".tz_localize('UTC').tz_convert(dtype.tz) " "or pd.Series(data.view('int64'), dtype=dtype)", FutureWarning, - stacklevel=5, + stacklevel=find_stack_level(), ) value = dta.tz_localize("UTC").tz_convert(dtype.tz) @@ -1859,7 +1859,7 @@ def construct_2d_arraylike_from_scalar( shape = (length, width) if dtype.kind in ["m", "M"]: - value = maybe_unbox_datetimelike_tz_deprecation(value, dtype, stacklevel=4) + value = maybe_unbox_datetimelike_tz_deprecation(value, dtype) # error: Non-overlapping equality check (left operand type: "dtype[Any]", right # operand type: "Type[object]") elif dtype == object: # type: ignore[comparison-overlap] @@ -1932,9 +1932,7 @@ def construct_1d_arraylike_from_scalar( return subarr -def maybe_unbox_datetimelike_tz_deprecation( - value: Scalar, dtype: DtypeObj, stacklevel: int = 5 -): +def maybe_unbox_datetimelike_tz_deprecation(value: Scalar, dtype: DtypeObj): """ Wrap maybe_unbox_datetimelike with a check for a timezone-aware Timestamp along with a timezone-naive datetime64 dtype, which is deprecated. @@ -1963,7 +1961,7 @@ def maybe_unbox_datetimelike_tz_deprecation( "`pd.Series(values).dt.tz_localize(None)` " "instead.", FutureWarning, - stacklevel=stacklevel, + stacklevel=find_stack_level(), ) new_value = value.tz_localize(None) return maybe_unbox_datetimelike(new_value, dtype) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 815a0a2040ddb..7ac8e6c47158c 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -22,6 +22,7 @@ ArrayLike, DtypeObj, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import _registry as registry from pandas.core.dtypes.dtypes import ( @@ -304,7 +305,7 @@ def is_categorical(arr) -> bool: "is_categorical is deprecated and will be removed in a future version. 
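[illustrative note — not part of the patch] A side benefit visible in the
cast.py hunks above: once the level is computed at runtime, helpers no
longer need a stacklevel parameter threaded through their signatures.
Before and after, paraphrased from the diff:

    # before: every caller had to know its own depth in the call chain
    def maybe_unbox_datetimelike_tz_deprecation(value, dtype, stacklevel: int = 5): ...

    # after: the helper finds the level itself
    def maybe_unbox_datetimelike_tz_deprecation(value, dtype): ...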
" "Use is_categorical_dtype instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return isinstance(arr, ABCCategorical) or is_categorical_dtype(arr) @@ -1378,7 +1379,7 @@ def is_extension_type(arr) -> bool: "'is_extension_type' is deprecated and will be removed in a future " "version. Use 'is_extension_array_dtype' instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) if is_categorical_dtype(arr): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1f26b6d9ae6ae..b01de5dec610d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -83,6 +83,7 @@ doc, rewrite_axis_style_signature, ) +from pandas.util._exceptions import find_stack_level from pandas.util._validators import ( validate_ascending, validate_axis_style_args, @@ -643,7 +644,7 @@ def __init__( "removed in a future version. Pass " "{name: data[name] for name in data.dtype.names} instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) # a masked array @@ -1793,7 +1794,7 @@ def to_dict(self, orient: str = "dict", into=dict): warnings.warn( "DataFrame columns are not unique, some columns will be omitted.", UserWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) # GH16122 into_c = com.standardize_mapping(into) @@ -1814,7 +1815,7 @@ def to_dict(self, orient: str = "dict", into=dict): "will be used in a future version. Use one of the above " "to silence this warning.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) if orient.startswith("d"): @@ -2659,7 +2660,7 @@ def to_markdown( "'showindex' is deprecated. Only 'index' will be used " "in a future version. Use 'index' to silence this warning.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) kwargs.setdefault("headers", "keys") @@ -3218,7 +3219,7 @@ def info( warnings.warn( "null_counts is deprecated. Use show_counts instead", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) show_counts = null_counts info = DataFrameInfo( @@ -3591,7 +3592,7 @@ def _getitem_bool_array(self, key): warnings.warn( "Boolean Series key will be reindexed to match DataFrame index.", UserWarning, - stacklevel=3, + stacklevel=find_stack_level(), ) elif len(key) != len(self.index): raise ValueError( @@ -4637,7 +4638,7 @@ def lookup( "You can use DataFrame.melt and DataFrame.loc " "as a substitute." ) - warnings.warn(msg, FutureWarning, stacklevel=2) + warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) n = len(row_labels) if n != len(col_labels): @@ -7754,7 +7755,7 @@ def groupby( "will be removed in a future version." ), FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) else: squeeze = False @@ -9844,7 +9845,7 @@ def count( "deprecated and will be removed in a future version. Use groupby " "instead. df.count(level=1) should use df.groupby(level=1).count().", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return self._count_level(level, axis=axis, numeric_only=numeric_only) @@ -9944,7 +9945,7 @@ def _reduce( "will include datetime64 and datetime64tz columns in a " "future version.", FutureWarning, - stacklevel=5, + stacklevel=find_stack_level(), ) # Non-copy equivalent to # dt64_cols = self.dtypes.apply(is_datetime64_any_dtype) @@ -10019,7 +10020,7 @@ def _get_data() -> DataFrame: "version this will raise TypeError. 
Select only valid " "columns before calling the reduction.", FutureWarning, - stacklevel=5, + stacklevel=find_stack_level(), ) return out @@ -10052,7 +10053,7 @@ def _get_data() -> DataFrame: "version this will raise TypeError. Select only valid " "columns before calling the reduction.", FutureWarning, - stacklevel=5, + stacklevel=find_stack_level(), ) if hasattr(result, "dtype"): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 93bf70c27f8ff..23608cf0192df 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3647,7 +3647,7 @@ class max_speed "is_copy is deprecated and will be removed in a future version. " "'take' always returns a copy, so there is no need to specify this.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) nv.validate_take((), kwargs) @@ -3781,7 +3781,7 @@ class animal locomotion "Passing lists as key for xs is deprecated and will be removed in a " "future version. Pass key as a tuple instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) if level is not None: @@ -5556,7 +5556,7 @@ def __setattr__(self, name: str, value) -> None: "created via a new attribute name - see " "https://pandas.pydata.org/pandas-docs/" "stable/indexing.html#attribute-access", - stacklevel=2, + stacklevel=find_stack_level(), ) object.__setattr__(self, name, value) @@ -7774,7 +7774,7 @@ def between_time( "`include_start` and `include_end` are deprecated in " "favour of `inclusive`.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) left = True if isinstance(include_start, lib.NoDefault) else include_start right = True if isinstance(include_end, lib.NoDefault) else include_end @@ -9190,7 +9190,7 @@ def where( "try_cast keyword is deprecated and will be removed in a " "future version.", FutureWarning, - stacklevel=4, + stacklevel=find_stack_level(), ) return self._where(cond, other, inplace, axis, level, errors=errors) @@ -9222,7 +9222,7 @@ def mask( "try_cast keyword is deprecated and will be removed in a " "future version.", FutureWarning, - stacklevel=4, + stacklevel=find_stack_level(), ) # see gh-21891 @@ -9415,7 +9415,7 @@ def slice_shift(self: NDFrameT, periods: int = 1, axis=0) -> NDFrameT: "and will be removed in a future version. " "You can use DataFrame/Series.shift instead." ) - warnings.warn(msg, FutureWarning, stacklevel=2) + warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) if periods == 0: return self @@ -9467,7 +9467,7 @@ def tshift(self: NDFrameT, periods: int = 1, freq=None, axis: Axis = 0) -> NDFra "Please use shift instead." ), FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) if freq is None: @@ -10282,7 +10282,7 @@ def _logical_func( "deprecated and will be removed in a future version. Use groupby " "instead. df.any(level=1) should use df.groupby(level=1).any()", FutureWarning, - stacklevel=4, + stacklevel=find_stack_level(), ) if bool_only is not None: raise NotImplementedError( @@ -10378,7 +10378,7 @@ def _stat_function_ddof( "deprecated and will be removed in a future version. Use groupby " "instead. df.var(level=1) should use df.groupby(level=1).var().", FutureWarning, - stacklevel=4, + stacklevel=find_stack_level(), ) return self._agg_by_level( name, axis=axis, level=level, skipna=skipna, ddof=ddof @@ -10431,7 +10431,7 @@ def _stat_function( "deprecated and will be removed in a future version. Use groupby " "instead. 
df.median(level=1) should use df.groupby(level=1).median().", FutureWarning, - stacklevel=4, + stacklevel=find_stack_level(), ) return self._agg_by_level( name, axis=axis, level=level, skipna=skipna, numeric_only=numeric_only @@ -10498,7 +10498,7 @@ def _min_count_stat_function( "deprecated and will be removed in a future version. Use groupby " "instead. df.sum(level=1) should use df.groupby(level=1).sum().", FutureWarning, - stacklevel=4, + stacklevel=find_stack_level(), ) return self._agg_by_level( name, @@ -10582,7 +10582,7 @@ def mad(self, axis=None, skipna=None, level=None): "deprecated and will be removed in a future version. Use groupby " "instead. df.mad(level=1) should use df.groupby(level=1).mad()", FutureWarning, - stacklevel=3, + stacklevel=find_stack_level(), ) return self._agg_by_level("mad", axis=axis, level=level, skipna=skipna) @@ -10980,7 +10980,7 @@ def expanding( warnings.warn( "The `center` argument on `expanding` will be removed in the future.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) else: center = False diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8a330d08bef78..3c45f7263265c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -37,6 +37,7 @@ Substitution, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( ensure_int64, @@ -1330,7 +1331,7 @@ def __getitem__(self, key) -> DataFrameGroupBy | SeriesGroupBy: "Indexing with multiple keys (implicitly converted to a tuple " "of keys) will be deprecated, use a list instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return super().__getitem__(key) diff --git a/pandas/core/index.py b/pandas/core/index.py index 13a687b1c27e3..00ca6f9048a40 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1,5 +1,7 @@ import warnings +from pandas.util._exceptions import find_stack_level + from pandas.core.indexes.api import ( # noqa:F401 CategoricalIndex, DatetimeIndex, @@ -26,5 +28,5 @@ "pandas.core.index is deprecated and will be removed in a future version. " "The public classes are available in the top-level namespace.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) diff --git a/pandas/core/indexers/utils.py b/pandas/core/indexers/utils.py index b1824413512c5..41920727c50fd 100644 --- a/pandas/core/indexers/utils.py +++ b/pandas/core/indexers/utils.py @@ -399,7 +399,7 @@ def unpack_1tuple(tup): "slice is deprecated and will raise in a future " "version. Pass a tuple instead.", FutureWarning, - stacklevel=3, + stacklevel=find_stack_level(), ) return tup[0] diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index b8f4b5f9d3423..3aad1140294e5 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -8,6 +8,8 @@ import numpy as np +from pandas.util._exceptions import find_stack_level + from pandas.core.dtypes.common import ( is_categorical_dtype, is_datetime64_dtype, @@ -286,7 +288,7 @@ def weekofyear(self): "Series.dt.weekofyear and Series.dt.week have been deprecated. 
" "Please use Series.dt.isocalendar().week instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) week_series = self.isocalendar().week week_series.name = self.name diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2514702b036dd..9715bf8f61f3c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -399,7 +399,7 @@ def __new__( "'tupleize_cols' is deprecated and will raise TypeError in a " "future version. Use the specific Index subclass directly instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) from pandas.core.arrays import PandasArray @@ -632,7 +632,7 @@ def asi8(self): warnings.warn( "Index.asi8 is deprecated and will be removed in a future version.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return None @@ -746,7 +746,7 @@ def _get_attributes_dict(self) -> dict[str_t, Any]: "The Index._get_attributes_dict method is deprecated, and will be " "removed in a future version", DeprecationWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return {k: getattr(self, k, None) for k in self._attributes} @@ -919,7 +919,7 @@ def ravel(self, order="C"): "Index.ravel returning ndarray is deprecated; in a future version " "this will return a view on self.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) if needs_i8_conversion(self.dtype): # Item "ndarray[Any, Any]" of "Union[ExtensionArray, ndarray[Any, Any]]" @@ -1191,7 +1191,7 @@ def copy( "parameter dtype is deprecated and will be removed in a future " "version. Use the astype method instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) new_index = new_index.astype(dtype) return new_index @@ -1371,7 +1371,7 @@ def to_native_types(self, slicer=None, **kwargs) -> np.ndarray: "The 'to_native_types' method is deprecated and will be removed in " "a future version. Use 'astype(str)' instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) values = self if slicer is not None: @@ -2503,7 +2503,7 @@ def is_mixed(self) -> bool: "Index.is_mixed is deprecated and will be removed in a future version. " "Check index.inferred_type directly instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return self.inferred_type in ["mixed"] @@ -2538,7 +2538,7 @@ def is_all_dates(self) -> bool: "Index.is_all_dates is deprecated, will be removed in a future version. " "check index.inferred_type instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return self._is_all_dates @@ -2905,7 +2905,7 @@ def __and__(self, other): "in the future this will be a logical operation matching " "Series.__and__. Use index.intersection(other) instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return self.intersection(other) @@ -2916,7 +2916,7 @@ def __or__(self, other): "in the future this will be a logical operation matching " "Series.__or__. Use index.union(other) instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return self.union(other) @@ -2927,7 +2927,7 @@ def __xor__(self, other): "in the future this will be a logical operation matching " "Series.__xor__. Use index.symmetric_difference(other) instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return self.symmetric_difference(other) @@ -3073,7 +3073,7 @@ def union(self, other, sort=None): "object dtype. 
To retain the old behavior, " "use `index.astype(object).union(other)`", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) dtype = self._find_common_type_compat(other) @@ -3524,7 +3524,7 @@ def get_loc(self, key, method=None, tolerance=None): "and will raise in a future version. Use " "index.get_indexer([item], method=...) instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) if is_scalar(key) and isna(key) and not self.hasnans: @@ -3958,7 +3958,7 @@ def is_int(v): "and will raise TypeError in a future version. " "Use .loc with labels or .iloc with positions instead.", FutureWarning, - stacklevel=5, + stacklevel=find_stack_level(), ) indexer = key else: @@ -4107,7 +4107,7 @@ def reindex( "reindexing with a non-unique Index is deprecated and " "will raise in a future version.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) target = self._wrap_reindex_result(target, indexer, preserve_names) @@ -4848,7 +4848,7 @@ def is_type_compatible(self, kind: str_t) -> bool: "Index.is_type_compatible is deprecated and will be removed in a " "future version.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return kind == self.inferred_type @@ -5485,7 +5485,7 @@ def get_value(self, series: Series, key): "get_value is deprecated and will be removed in a future version. " "Use Series[key] instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) self._check_indexing_error(key) @@ -5553,7 +5553,7 @@ def set_value(self, arr, key, value): "will be removed in a future version." ), FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) loc = self._engine.get_loc(key) validate_numeric_casting(arr.dtype, value) @@ -7023,7 +7023,7 @@ def _maybe_cast_data_without_dtype( "In a future version, the Index constructor will not infer numeric " "dtypes when passed object-dtype sequences (matching Series behavior)", FutureWarning, - stacklevel=3, + stacklevel=find_stack_level(), ) if result.dtype.kind in ["b", "c"]: return subarr @@ -7081,6 +7081,6 @@ def _maybe_try_sort(result, sort): warnings.warn( f"{err}, sort order is undefined for incomparable objects.", RuntimeWarning, - stacklevel=4, + stacklevel=find_stack_level(), ) return result diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index e2dd5ecfde5a8..f26a24c38b19f 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -17,6 +17,7 @@ npt, ) from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -218,7 +219,7 @@ def __new__( "deprecated and will raise in a future version. " "Use CategoricalIndex([], ...) 
instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) data = [] @@ -431,7 +432,7 @@ def reindex( "reindexing with a non-unique Index is deprecated and will " "raise in a future version.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) if len(self) and indexer is not None: @@ -506,7 +507,7 @@ def take_nd(self, *args, **kwargs): "CategoricalIndex.take_nd is deprecated, use CategoricalIndex.take " "instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return self.take(*args, **kwargs) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index a0902a5fb32fe..104bce0369d37 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -36,6 +36,7 @@ cache_readonly, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -403,7 +404,7 @@ def is_type_compatible(self, kind: str) -> bool: f"{type(self).__name__}.is_type_compatible is deprecated and will be " "removed in a future version.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return kind in self._data._infer_matches diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 6078da3bedd8c..e283509206344 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -495,7 +495,7 @@ def to_series(self, keep_tz=lib.no_default, index=None, name=None): "is deprecated and will be removed in a future version. " "You can stop passing 'keep_tz' to silence this warning.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) else: warnings.warn( @@ -505,7 +505,7 @@ def to_series(self, keep_tz=lib.no_default, index=None, name=None): "can do 'idx.tz_convert(None)' before calling " "'to_series'.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) else: keep_tz = True @@ -752,7 +752,7 @@ def check_str_or_none(point): "with non-existing keys is deprecated and will raise a " "KeyError in a future Version.", FutureWarning, - stacklevel=5, + stacklevel=find_stack_level(), ) indexer = mask.nonzero()[0][::step] if len(indexer) == len(self): @@ -1042,7 +1042,7 @@ def date_range( warnings.warn( "Argument `closed` is deprecated in favor of `inclusive`.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) if closed is None: inclusive = "both" diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index fe97d61be7548..128aa8e282a0d 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -46,6 +46,7 @@ deprecate_nonkeyword_arguments, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import coerce_indexer_dtype from pandas.core.dtypes.common import ( @@ -893,7 +894,7 @@ def set_levels( warnings.warn( "inplace is deprecated and will be removed in a future version.", FutureWarning, - stacklevel=3, + stacklevel=find_stack_level(), ) else: inplace = False @@ -1054,7 +1055,7 @@ def set_codes(self, codes, level=None, inplace=None, verify_integrity: bool = Tr warnings.warn( "inplace is deprecated and will be removed in a future version.", FutureWarning, - stacklevel=3, + stacklevel=find_stack_level(), ) else: inplace = False @@ -1166,14 +1167,14 @@ def copy( "parameter levels is deprecated and will be removed in a future " "version. 
Use the set_levels method instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) if codes is not None: warnings.warn( "parameter codes is deprecated and will be removed in a future " "version. Use the set_codes method instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) if deep: @@ -1202,7 +1203,7 @@ def copy( "parameter dtype is deprecated and will be removed in a future " "version. Use the astype method instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) new_index = new_index.astype(dtype) return new_index @@ -1802,7 +1803,7 @@ def is_lexsorted(self) -> bool: "MultiIndex.is_lexsorted is deprecated as a public function, " "users should use MultiIndex.is_monotonic_increasing instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return self._is_lexsorted() @@ -1846,7 +1847,7 @@ def lexsort_depth(self) -> int: "MultiIndex.is_lexsorted is deprecated as a public function, " "users should use MultiIndex.is_monotonic_increasing instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return self._lexsort_depth @@ -2212,7 +2213,7 @@ def drop(self, codes, level=None, errors="raise"): "dropping on a non-lexsorted multi-index " "without a level parameter may impact performance.", PerformanceWarning, - stacklevel=3, + stacklevel=find_stack_level(), ) loc = loc.nonzero()[0] inds.extend(loc) @@ -2877,7 +2878,7 @@ def _maybe_to_slice(loc): warnings.warn( "indexing past lexsort depth may impact performance.", PerformanceWarning, - stacklevel=10, + stacklevel=find_stack_level(), ) loc = np.arange(start, stop, dtype=np.intp) @@ -3335,7 +3336,7 @@ def _update_indexer(idxr: Index, indexer: Index) -> Index: # TODO: how to handle IntervalIndex level? # (no test cases) FutureWarning, - stacklevel=7, + stacklevel=find_stack_level(), ) continue else: diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 4d8c411478993..25b43c556b812 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -21,6 +21,7 @@ cache_readonly, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( @@ -421,7 +422,7 @@ def asi8(self) -> npt.NDArray[np.int64]: warnings.warn( "Index.asi8 is deprecated and will be removed in a future version.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return self._values.view(self._default_dtype) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index fd5b5bb7396af..23851eff252b4 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -25,6 +25,7 @@ DtypeObj, ) from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_datetime64_any_dtype, @@ -346,7 +347,7 @@ def astype(self, dtype, copy: bool = True, how=lib.no_default): "will be removed in a future version. 
" "Use index.to_timestamp(how=how) instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) else: how = "start" diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index aed7a7a467db3..fdb1ee754a7e6 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -29,6 +29,7 @@ cache_readonly, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( ensure_platform_int, @@ -256,7 +257,7 @@ def _start(self) -> int: warnings.warn( self._deprecation_message.format("_start", "start"), FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return self.start @@ -279,7 +280,7 @@ def _stop(self) -> int: warnings.warn( self._deprecation_message.format("_stop", "stop"), FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return self.stop @@ -303,7 +304,7 @@ def _step(self) -> int: warnings.warn( self._deprecation_message.format("_step", "step"), FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return self.step @@ -456,7 +457,7 @@ def copy( "parameter dtype is deprecated and will be removed in a future " "version. Use the astype method instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) new_index = new_index.astype(dtype) return new_index diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 669274e034905..e773bf5ffb7f4 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -16,6 +16,7 @@ InvalidIndexError, ) from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_array_like, @@ -1381,7 +1382,7 @@ def _has_valid_setitem_indexer(self, indexer) -> bool: "a future version.\n" "consider using .loc with a DataFrame indexer for automatic alignment.", FutureWarning, - stacklevel=3, + stacklevel=find_stack_level(), ) if not isinstance(indexer, tuple): @@ -2298,7 +2299,7 @@ def convert_to_index_sliceable(obj: DataFrame, key): "and will be removed in a future version. Use `frame.loc[string]` " "instead.", FutureWarning, - stacklevel=3, + stacklevel=find_stack_level(), ) return res except (KeyError, ValueError, NotImplementedError): diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 6cbaae3fe12e0..75715bdc90003 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -44,12 +44,14 @@ def __getattr__(name: str): import warnings + from pandas.util._exceptions import find_stack_level + if name == "CategoricalBlock": warnings.warn( "CategoricalBlock is deprecated and will be removed in a future version. " "Use ExtensionBlock instead.", DeprecationWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) from pandas.core.internals.blocks import CategoricalBlock diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 66a40b962e183..55e5b0d0439fa 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -190,7 +190,7 @@ def is_categorical(self) -> bool: "future version. Use isinstance(block.values, Categorical) " "instead. 
See https://github.com/pandas-dev/pandas/issues/40226", DeprecationWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return isinstance(self.values, Categorical) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 159c20382dcfb..e6d6b561803d6 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -23,6 +23,7 @@ DtypeObj, Manager, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, @@ -830,7 +831,7 @@ def to_arrays( "To retain the old behavior, pass as a dictionary " "DataFrame({col: categorical, ..})", FutureWarning, - stacklevel=4, + stacklevel=find_stack_level(), ) if columns is None: columns = default_index(len(data)) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b4d6e0ace4223..cb0c3e05e955f 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1192,7 +1192,7 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: "Consider joining all columns at once using pd.concat(axis=1) " "instead. To get a de-fragmented frame, use `newframe = frame.copy()`", PerformanceWarning, - stacklevel=5, + stacklevel=find_stack_level(), ) def _insert_update_mgr_locs(self, loc) -> None: @@ -1637,7 +1637,7 @@ def __init__( "The `fastpath` keyword is deprecated and will be removed " "in a future version.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) self.axes = [axis] diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index ece5b21fa2f8e..540a557f7c7cc 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -14,6 +14,7 @@ from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op # noqa:F401 from pandas._typing import Level from pandas.util._decorators import Appender +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_array_like, @@ -300,7 +301,7 @@ def to_series(right): "Do `left, right = left.align(right, axis=1, copy=False)` " "before e.g. 
`left == right`", FutureWarning, - stacklevel=5, + stacklevel=find_stack_level(), ) left, right = left.align( diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 1b217a592987f..7026e470df1c0 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -10,6 +10,7 @@ Appender, deprecate_kwarg, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_extension_array_dtype, @@ -58,7 +59,7 @@ def melt( "In the future this will raise an error, please set the 'value_name' " "parameter of DataFrame.melt to a unique name.", FutureWarning, - stacklevel=3, + stacklevel=find_stack_level(), ) if id_vars is not None: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index a88d1dce693f6..4dd15dd367581 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -35,6 +35,7 @@ Appender, Substitution, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( @@ -676,7 +677,7 @@ def __init__( ) # stacklevel chosen to be correct when this is reached via pd.merge # (and not DataFrame.join) - warnings.warn(msg, FutureWarning, stacklevel=3) + warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) self._validate_specification() @@ -2297,7 +2298,7 @@ def _items_overlap_with_suffix( "unexpected results. Provide 'suffixes' as a tuple instead. In the " "future a 'TypeError' will be raised.", FutureWarning, - stacklevel=4, + stacklevel=find_stack_level(), ) to_rename = left.intersection(right) @@ -2347,7 +2348,7 @@ def renamer(x, suffix): f"Passing 'suffixes' which cause duplicate columns {set(dups)} in the " f"result is deprecated and will raise a MergeError in a future version.", FutureWarning, - stacklevel=4, + stacklevel=find_stack_level(), ) return llabels, rlabels diff --git a/pandas/core/series.py b/pandas/core/series.py index 996af80139458..b3c9167bfbbab 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -54,6 +54,7 @@ deprecate_nonkeyword_arguments, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.util._validators import ( validate_ascending, validate_bool_kwarg, @@ -360,7 +361,7 @@ def __init__( "of 'float64' in a future version. Specify a dtype explicitly " "to silence this warning.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) # uncomment the line below when removing the FutureWarning # dtype = np.dtype(object) @@ -886,7 +887,7 @@ def take(self, indices, axis=0, is_copy=None, **kwargs) -> Series: "is_copy is deprecated and will be removed in a future version. " "'take' always returns a copy, so there is no need to specify this.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) nv.validate_take((), kwargs) @@ -1078,7 +1079,7 @@ def __setitem__(self, key, value) -> None: "Series. Use `series.iloc[an_int] = val` to treat the " "key as positional.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) # this is equivalent to self._values[key] = value self._mgr.setitem_inplace(key, value) @@ -1887,7 +1888,7 @@ def groupby( "will be removed in a future version." ), FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) else: squeeze = False @@ -1949,7 +1950,7 @@ def count(self, level=None): "deprecated and will be removed in a future version. Use groupby " "instead. 
ser.count(level=1) should use ser.groupby(level=1).count().", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) if not isinstance(self.index, MultiIndex): raise ValueError("Series.count level is only valid with a MultiIndex") @@ -5135,7 +5136,7 @@ def between(self, left, right, inclusive="both") -> Series: "Boolean inputs to the `inclusive` argument are deprecated in " "favour of `both` or `neither`.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) if inclusive: inclusive = "both" diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 1e27febab2af9..f82e1aa5d188c 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -19,6 +19,7 @@ F, ) from pandas.util._decorators import Appender +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( ensure_object, @@ -238,7 +239,7 @@ def __iter__(self): warnings.warn( "Columnar iteration over characters will be deprecated in future releases.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) i = 0 g = self.get(i) @@ -1214,7 +1215,7 @@ def contains(self, pat, case=True, flags=0, na=None, regex=True): "This pattern has match groups. To actually get the " "groups, use str.extract.", UserWarning, - stacklevel=3, + stacklevel=find_stack_level(), ) result = self._data.array._str_contains(pat, case, flags, na, regex) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 669a39fcb3a74..67a6975c21fdd 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -39,6 +39,7 @@ ArrayLike, Timezone, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( ensure_object, @@ -1109,7 +1110,7 @@ def to_time(arg, format=None, infer_time_format=False, errors="raise"): "`to_time` has been moved, should be imported from pandas.core.tools.times. " "This alias will be removed in a future version.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) from pandas.core.tools.times import to_time diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index c17af442fe2cc..f5f681d9de797 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -640,7 +640,7 @@ def vol(self, bias: bool = False, *args, **kwargs): "Use std instead." ), FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return self.std(bias, *args, **kwargs) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index b04aab3755b91..f7799912937b7 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -167,7 +167,7 @@ def win_type(self): "win_type will no longer return 'freq' in a future version. " "Check the type of self.window instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return "freq" return self._win_type @@ -177,7 +177,7 @@ def is_datetimelike(self) -> bool: warnings.warn( "is_datetimelike is deprecated and will be removed in a future version.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return self._win_freq_i8 is not None @@ -185,7 +185,7 @@ def validate(self) -> None: warnings.warn( "validate is deprecated and will be removed in a future version.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return self._validate() @@ -1763,6 +1763,7 @@ def count(self): "Specify min_periods=0 instead." 
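A note on the ``stacklevel`` changes running through these patches: the value tells ``warnings.warn`` how many frames to walk up when attributing a warning, so a hard-coded number is only correct for one particular call depth. A minimal, self-contained illustration in plain Python (not pandas code):

    import warnings

    def internal_helper():
        # stacklevel=2 attributes the warning to internal_helper's caller;
        # wrap this in one more layer and the hard-coded 2 silently points
        # at the wrong frame, which is the problem find_stack_level() solves
        warnings.warn("deprecated", FutureWarning, stacklevel=2)

    def user_code():
        internal_helper()  # with stacklevel=2 the warning points at this line

    user_code()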
), FutureWarning, + stacklevel=find_stack_level(), ) self.min_periods = 0 result = super().count() From 085acb68cd1782d5a0a7bab4d0ccf84c96d87e79 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 12 Nov 2021 15:55:43 +0100 Subject: [PATCH 27/53] TST: Add nulls fixture to duplicates categorical na test (#44407) --- pandas/tests/series/methods/test_drop_duplicates.py | 7 ++++--- pandas/tests/series/methods/test_duplicated.py | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py index f72d85337df8e..8b5557ab6e85f 100644 --- a/pandas/tests/series/methods/test_drop_duplicates.py +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -2,7 +2,6 @@ import pytest from pandas import ( - NA, Categorical, Series, ) @@ -225,11 +224,13 @@ def test_drop_duplicates_categorical_bool(self, ordered): assert return_value is None tm.assert_series_equal(sc, tc[~expected]) - def test_drop_duplicates_categorical_bool_na(self): + def test_drop_duplicates_categorical_bool_na(self, nulls_fixture): # GH#44351 ser = Series( Categorical( - [True, False, True, False, NA], categories=[True, False], ordered=True + [True, False, True, False, nulls_fixture], + categories=[True, False], + ordered=True, ) ) result = ser.drop_duplicates() diff --git a/pandas/tests/series/methods/test_duplicated.py b/pandas/tests/series/methods/test_duplicated.py index c61492168da63..1c547ee99efed 100644 --- a/pandas/tests/series/methods/test_duplicated.py +++ b/pandas/tests/series/methods/test_duplicated.py @@ -2,7 +2,6 @@ import pytest from pandas import ( - NA, Categorical, Series, ) @@ -39,11 +38,13 @@ def test_duplicated_nan_none(keep, expected): tm.assert_series_equal(result, expected) -def test_duplicated_categorical_bool_na(): +def test_duplicated_categorical_bool_na(nulls_fixture): # GH#44351 ser = Series( Categorical( - [True, False, True, False, NA], categories=[True, False], ordered=True + [True, False, True, False, nulls_fixture], + categories=[True, False], + ordered=True, ) ) result = ser.duplicated() From a0b00b8ed8690b1433c3e5eff94985c2dc6cd4f3 Mon Sep 17 00:00:00 2001 From: realead Date: Fri, 12 Nov 2021 15:57:13 +0100 Subject: [PATCH 28/53] [BUG] don't mangle null-objects in value_counts (#42743) --- asv_bench/benchmarks/series_methods.py | 24 +++++++++++++++++ doc/source/whatsnew/v1.4.0.rst | 32 +++++++++++++++++++++++ pandas/_libs/hashtable_func_helper.pxi.in | 9 ++----- pandas/tests/base/test_value_counts.py | 2 +- pandas/tests/indexing/test_indexing.py | 8 +++--- pandas/tests/libs/test_hashtable.py | 10 +++---- 6 files changed, 67 insertions(+), 18 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 155dd6f8e13a0..d8578ed604ae3 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -152,6 +152,18 @@ def time_value_counts(self, N, dtype): self.s.value_counts() +class ValueCountsObjectDropNAFalse: + + params = [10 ** 3, 10 ** 4, 10 ** 5] + param_names = ["N"] + + def setup(self, N): + self.s = Series(np.random.randint(0, N, size=10 * N)).astype("object") + + def time_value_counts(self, N): + self.s.value_counts(dropna=False) + + class Mode: params = [[10 ** 3, 10 ** 4, 10 ** 5], ["int", "uint", "float", "object"]] @@ -164,6 +176,18 @@ def time_mode(self, N, dtype): self.s.mode() +class ModeObjectDropNAFalse: + + params = [10 ** 3, 10 ** 4, 10 
** 5]
+    param_names = ["N"]
+
+    def setup(self, N):
+        self.s = Series(np.random.randint(0, N, size=10 * N)).astype("object")
+
+    def time_mode(self, N):
+        self.s.mode(dropna=False)
+
+
 class Dir:
     def setup(self):
         self.s = Series(index=tm.makeStringIndex(10000))
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index d1e209adb1b8f..8db9be21ca4ef 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -240,6 +240,38 @@ Now the float-dtype is respected. Since the common dtype for these DataFrames is

 *New behavior*:

+.. ipython:: python
+
+   res
+
+.. _whatsnew_140.notable_bug_fixes.value_counts_and_mode_do_not_coerce_to_nan:
+
+Null-values are no longer coerced to NaN-value in value_counts and mode
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:meth:`Series.value_counts` and :meth:`Series.mode` no longer coerce ``None``, ``NaT`` and other null-values to a NaN-value for ``np.object``-dtype. This behavior is now consistent with ``unique``, ``isin`` and others (:issue:`42688`).
+
+.. ipython:: python
+
+    s = pd.Series([True, None, pd.NaT, None, pd.NaT, None])
+    res = s.value_counts(dropna=False)
+
+Previously, all null-values were replaced by a NaN-value.
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+    In [3]: res
+    Out[3]:
+    NaN     5
+    True    1
+    dtype: int64
+
+Now null-values are no longer mangled.
+
+*New behavior*:
+
 .. ipython:: python

     res
diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in
index fb8ce79a924a4..e5e64f8dc7b5f 100644
--- a/pandas/_libs/hashtable_func_helper.pxi.in
+++ b/pandas/_libs/hashtable_func_helper.pxi.in
@@ -31,7 +31,7 @@ dtypes = [('Complex128', 'complex128', 'complex128',
 @cython.wraparound(False)
 @cython.boundscheck(False)
 {{if dtype == 'object'}}
-cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, navalue=np.NaN):
+cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
 {{else}}
 cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
 {{endif}}
@@ -42,7 +42,6 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
         # Don't use Py_ssize_t, since table.n_buckets is unsigned
         khiter_t k
-        bint is_null

         {{c_type}} val

@@ -61,11 +60,7 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
     for i in range(n):
         val = values[i]
-        is_null = checknull(val)
-        if not is_null or not dropna:
-            # all nas become the same representative:
-            if is_null:
-                val = navalue
+        if not dropna or not checknull(val):
             k = kh_get_{{ttype}}(table, val)
             if k != table.n_buckets:
                 table.vals[k] += 1
diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py
index 5431baf493260..23bb4c5d2670c 100644
--- a/pandas/tests/base/test_value_counts.py
+++ b/pandas/tests/base/test_value_counts.py
@@ -281,5 +281,5 @@ def test_value_counts_with_nan(dropna, index_or_series):
     if dropna is True:
         expected = Series([1], index=[True])
     else:
-        expected = Series([2, 1], index=[pd.NA, True])
+        expected = Series([1, 1, 1], index=[True, pd.NA, np.nan])
     tm.assert_series_equal(res, expected)
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
index a10288b2091ca..7c7e9f79a77ae 100644
--- a/pandas/tests/indexing/test_indexing.py
+++ b/pandas/tests/indexing/test_indexing.py
@@ -786,12 +786,12 @@ def test_no_reference_cycle(self):
         del df
         assert wr() is None

-    def test_label_indexing_on_nan(self):
+    def test_label_indexing_on_nan(self, nulls_fixture):
         #
GH 32431 - df = Series([1, "{1,2}", 1, None]) + df = Series([1, "{1,2}", 1, nulls_fixture]) vc = df.value_counts(dropna=False) - result1 = vc.loc[np.nan] - result2 = vc[np.nan] + result1 = vc.loc[nulls_fixture] + result2 = vc[nulls_fixture] expected = 1 assert result1 == expected diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index bdc02ff0aa7a8..937eccf7a0afe 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -453,13 +453,11 @@ def test_mode_stable(self, dtype, writable): def test_modes_with_nans(): - # GH39007 - values = np.array([True, pd.NA, np.nan], dtype=np.object_) - # pd.Na and np.nan will have the same representative: np.nan - # thus we have 2 nans and 1 True + # GH42688, nans aren't mangled + nulls = [pd.NA, np.nan, pd.NaT, None] + values = np.array([True] + nulls * 2, dtype=np.object_) modes = ht.mode(values, False) - assert modes.size == 1 - assert np.isnan(modes[0]) + assert modes.size == len(nulls) def test_unique_label_indices_intp(writable): From 0010c6d405160e0998a0c00e854ffe6a2b3d6f06 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 12 Nov 2021 16:28:43 -0800 Subject: [PATCH 29/53] BUG: frame.loc[2:, 'z'] not setting inplace when multi-block (#44345) --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/indexing.py | 17 +++++++--- pandas/io/stata.py | 7 ++-- pandas/tests/frame/indexing/test_setitem.py | 36 ++++++++++++++++----- pandas/tests/frame/indexing/test_xs.py | 15 +++++---- pandas/tests/frame/test_reductions.py | 12 ++++++- 6 files changed, 65 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 8db9be21ca4ef..ee1dd58149451 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -575,6 +575,7 @@ Indexing - Bug when setting string-backed :class:`Categorical` values that can be parsed to datetimes into a :class:`DatetimeArray` or :class:`Series` or :class:`DataFrame` column backed by :class:`DatetimeArray` failing to parse these strings (:issue:`44236`) - Bug in :meth:`Series.__setitem__` with an integer dtype other than ``int64`` setting with a ``range`` object unnecessarily upcasting to ``int64`` (:issue:`44261`) - Bug in :meth:`Series.__setitem__` with a boolean mask indexer setting a listlike value of length 1 incorrectly broadcasting that value (:issue:`44265`) +- Bug in :meth:`DataFrame.loc.__setitem__` and :meth:`DataFrame.iloc.__setitem__` with mixed dtypes sometimes failing to operate in-place (:issue:`44345`) - Missing diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index e773bf5ffb7f4..91f1415178471 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1860,10 +1860,19 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): # in case of slice ser = value[pi] else: - # set the item, possibly having a dtype change - ser = ser.copy() - ser._mgr = ser._mgr.setitem(indexer=(pi,), value=value) - ser._maybe_update_cacher(clear=True, inplace=True) + # set the item, first attempting to operate inplace, then + # falling back to casting if necessary; see + # _whatsnew_130.notable_bug_fixes.setitem_column_try_inplace + + orig_values = ser._values + ser._mgr = ser._mgr.setitem((pi,), value) + + if ser._values is orig_values: + # The setitem happened inplace, so the DataFrame's values + # were modified inplace. 
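+                # (in that case there is nothing further to write back:
+                # the frame already sees the new values through the shared array)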
+ return + self.obj._iset_item(loc, ser, inplace=True) + return # reset the sliced object if unique self.obj._iset_item(loc, ser, inplace=True) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index f6c93e6f751c8..9803a2e4e3309 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -600,6 +600,8 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: # Cast from unsupported types to supported types is_nullable_int = isinstance(data[col].dtype, (_IntegerDtype, BooleanDtype)) orig = data[col] + # We need to find orig_missing before altering data below + orig_missing = orig.isna() if is_nullable_int: missing_loc = data[col].isna() if missing_loc.any(): @@ -650,11 +652,10 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: f"supported by Stata ({float64_max})" ) if is_nullable_int: - missing = orig.isna() - if missing.any(): + if orig_missing.any(): # Replace missing by Stata sentinel value sentinel = StataMissingValue.BASE_MISSING_VALUES[data[col].dtype.name] - data.loc[missing, col] = sentinel + data.loc[orig_missing, col] = sentinel if ws: warnings.warn(ws, PossiblePrecisionLoss) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index d735f0dbec8a5..389bf56ab6035 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -384,7 +384,7 @@ def test_setitem_frame_length_0_str_key(self, indexer): expected["A"] = expected["A"].astype("object") tm.assert_frame_equal(df, expected) - def test_setitem_frame_duplicate_columns(self, using_array_manager): + def test_setitem_frame_duplicate_columns(self, using_array_manager, request): # GH#15695 cols = ["A", "B", "C"] * 2 df = DataFrame(index=range(3), columns=cols) @@ -407,6 +407,11 @@ def test_setitem_frame_duplicate_columns(self, using_array_manager): expected["C"] = expected["C"].astype("int64") # TODO(ArrayManager) .loc still overwrites expected["B"] = expected["B"].astype("int64") + + mark = pytest.mark.xfail( + reason="Both 'A' columns get set with 3 instead of 0 and 3" + ) + request.node.add_marker(mark) else: # set these with unique columns to be extra-unambiguous expected[2] = expected[2].astype(np.int64) @@ -995,22 +1000,37 @@ def test_setitem_always_copy(self, float_frame): float_frame["E"][5:10] = np.nan assert notna(s[5:10]).all() - def test_setitem_clear_caches(self): - # see GH#304 + @pytest.mark.parametrize("consolidate", [True, False]) + def test_setitem_partial_column_inplace(self, consolidate, using_array_manager): + # This setting should be in-place, regardless of whether frame is + # single-block or multi-block + # GH#304 this used to be incorrectly not-inplace, in which case + # we needed to ensure _item_cache was cleared. 
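+        # "In-place" is pinned down below by checking that the column's
+        # original buffer still backs df["z"] after the partial write.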
+
         df = DataFrame(
             {"x": [1.1, 2.1, 3.1, 4.1], "y": [5.1, 6.1, 7.1, 8.1]}, index=[0, 1, 2, 3]
         )
         df.insert(2, "z", np.nan)
+        if not using_array_manager:
+            if consolidate:
+                df._consolidate_inplace()
+                assert len(df._mgr.blocks) == 1
+            else:
+                assert len(df._mgr.blocks) == 2

-        # cache it
-        foo = df["z"]
-        df.loc[df.index[2:], "z"] = 42
+        zvals = df["z"]._values

-        expected = Series([np.nan, np.nan, 42, 42], index=df.index, name="z")
+        df.loc[2:, "z"] = 42

-        assert df["z"] is not foo
+        expected = Series([np.nan, np.nan, 42, 42], index=df.index, name="z")
         tm.assert_series_equal(df["z"], expected)

+        # check setting occurred in-place
+        tm.assert_numpy_array_equal(zvals, expected.values)
+        assert np.shares_memory(zvals, df["z"]._values)
+        if not consolidate:
+            assert df["z"]._values is zvals
+
     def test_setitem_duplicate_columns_not_inplace(self):
         # GH#39510
         cols = ["A", "B"] * 2
diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py
index d2704876c31c5..c6938abb57d64 100644
--- a/pandas/tests/frame/indexing/test_xs.py
+++ b/pandas/tests/frame/indexing/test_xs.py
@@ -366,12 +366,7 @@ def test_xs_droplevel_false_view(self, using_array_manager):
         assert np.shares_memory(result.iloc[:, 0]._values, df.iloc[:, 0]._values)
         # modifying original df also modifies result when having a single block
         df.iloc[0, 0] = 2
-        if not using_array_manager:
-            expected = DataFrame({"a": [2]})
-        else:
-            # TODO(ArrayManager) iloc does not update the array inplace using
-            # "split" path
-            expected = DataFrame({"a": [1]})
+        expected = DataFrame({"a": [2]})
         tm.assert_frame_equal(result, expected)

         # with mixed dataframe, modifying the parent doesn't modify result
@@ -379,7 +374,13 @@
         df = DataFrame([[1, 2.5, "a"]], columns=Index(["a", "b", "c"]))
         result = df.xs("a", axis=1, drop_level=False)
         df.iloc[0, 0] = 2
-        expected = DataFrame({"a": [1]})
+        if using_array_manager:
+            # Here the behavior is consistent
+            expected = DataFrame({"a": [2]})
+        else:
+            # FIXME: iloc does not update the array inplace using
+            # "split" path
+            expected = DataFrame({"a": [1]})
         tm.assert_frame_equal(result, expected)

     def test_xs_list_indexer_droplevel_false(self):
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
index 919d8ab14778e..fc2c138538ac9 100644
--- a/pandas/tests/frame/test_reductions.py
+++ b/pandas/tests/frame/test_reductions.py
@@ -789,6 +789,10 @@ def test_std_timedelta64_skipna_false(self):
         # GH#37392
         tdi = pd.timedelta_range("1 Day", periods=10)
         df = DataFrame({"A": tdi, "B": tdi})
+        # Copy is needed for ArrayManager case, otherwise setting df.iloc
+        # below edits tdi, altering both df['A'] and df['B']
+        # FIXME: passing copy=True to constructor does not fix this
+        df = df.copy()
         df.iloc[-2, -1] = pd.NaT

         result = df.std(skipna=False)
@@ -1017,7 +1021,9 @@ def test_idxmax_mixed_dtype(self):
         # don't cast to object, which would raise in nanops
         dti = date_range("2016-01-01", periods=3)

-        df = DataFrame({1: [0, 2, 1], 2: range(3)[::-1], 3: dti})
+        # Copying dti is needed for ArrayManager otherwise when we set
+        # df.loc[0, 3] = pd.NaT below it edits dti
+        df = DataFrame({1: [0, 2, 1], 2: range(3)[::-1], 3: dti.copy(deep=True)})
         result = df.idxmax()
         expected = Series([1, 0, 2], index=[1, 2, 3])

@@ -1074,6 +1080,10 @@ def test_idxmax_idxmin_convert_dtypes(self, op, expected_value):
     def test_idxmax_dt64_multicolumn_axis1(self):
         dti = date_range("2016-01-01", periods=3)
         df = DataFrame({3: dti, 4: dti[::-1]})
+
# FIXME: copy needed for ArrayManager, otherwise setting with iloc + # below also sets df.iloc[-1, 1]; passing copy=True to DataFrame + # does not solve this. + df = df.copy() df.iloc[0, 0] = pd.NaT df._consolidate_inplace() From f1f7a7527b36bb1b6b40b90e5bd42223ad4fa54e Mon Sep 17 00:00:00 2001 From: Matt Richards <45483497+m-richards@users.noreply.github.com> Date: Sun, 14 Nov 2021 03:03:30 +1000 Subject: [PATCH 30/53] DataFrame.convert_dtypes doesn't preserve subclasses (#44249) --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/generic.py | 7 ++++++- pandas/tests/frame/test_subclass.py | 19 +++++++++++++++++++ pandas/tests/generic/test_finalize.py | 5 +---- 4 files changed, 27 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index ee1dd58149451..560c3fad59e5e 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -538,6 +538,7 @@ Conversion - Bug in :class:`Series` constructor returning 0 for missing values with dtype ``int64`` and ``False`` for dtype ``bool`` (:issue:`43017`, :issue:`43018`) - Bug in :class:`IntegerDtype` not allowing coercion from string dtype (:issue:`25472`) - Bug in :func:`to_datetime` with ``arg:xr.DataArray`` and ``unit="ns"`` specified raises TypeError (:issue:`44053`) +- Bug in :meth:`DataFrame.convert_dtypes` not returning the correct type when a subclass does not overload :meth:`_constructor_sliced` (:issue:`43201`) - Strings diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 23608cf0192df..6b51456006021 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -18,6 +18,7 @@ Literal, Mapping, Sequence, + Type, cast, final, overload, @@ -6219,8 +6220,12 @@ def convert_dtypes( for col_name, col in self.items() ] if len(results) > 0: + result = concat(results, axis=1, copy=False) + cons = cast(Type["DataFrame"], self._constructor) + result = cons(result) + result = result.__finalize__(self, method="convert_dtypes") # https://github.com/python/mypy/issues/8354 - return cast(NDFrameT, concat(results, axis=1, copy=False)) + return cast(NDFrameT, result) else: return self.copy() diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 42474ff00ad6d..8d9957b24300f 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -13,6 +13,16 @@ import pandas._testing as tm +@pytest.fixture() +def gpd_style_subclass_df(): + class SubclassedDataFrame(DataFrame): + @property + def _constructor(self): + return SubclassedDataFrame + + return SubclassedDataFrame({"a": [1, 2, 3]}) + + class TestDataFrameSubclassing: def test_frame_subclassing_and_slicing(self): # Subclass frame and ensure it returns the right class on slicing it @@ -704,6 +714,15 @@ def test_idxmax_preserves_subclass(self): result = df.idxmax() assert isinstance(result, tm.SubclassedSeries) + def test_convert_dtypes_preserves_subclass(self, gpd_style_subclass_df): + # GH 43668 + df = tm.SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) + result = df.convert_dtypes() + assert isinstance(result, tm.SubclassedDataFrame) + + result = gpd_style_subclass_df.convert_dtypes() + assert isinstance(result, type(gpd_style_subclass_df)) + def test_equals_subclass(self): # https://github.com/pandas-dev/pandas/pull/34402 # allow subclass in both directions diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index c1f8b5dd7cf41..135e8cc7b7aba 100644 --- 
a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -347,10 +347,7 @@ operator.methodcaller("infer_objects"), ), (pd.Series, ([1, 2],), operator.methodcaller("convert_dtypes")), - pytest.param( - (pd.DataFrame, frame_data, operator.methodcaller("convert_dtypes")), - marks=not_implemented_mark, - ), + (pd.DataFrame, frame_data, operator.methodcaller("convert_dtypes")), (pd.Series, ([1, None, 3],), operator.methodcaller("interpolate")), (pd.DataFrame, ({"A": [1, None, 3]},), operator.methodcaller("interpolate")), (pd.Series, ([1, 2],), operator.methodcaller("clip", lower=1)), From b94826094c100fc0d865d842a2fce488b1290a79 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sat, 13 Nov 2021 12:04:23 -0500 Subject: [PATCH 31/53] ENH: Use find_stack_level (#44416) --- pandas/_testing/asserters.py | 11 ++++++----- pandas/io/common.py | 3 ++- pandas/io/date_converters.py | 9 +++++---- pandas/io/excel/_base.py | 4 ++-- pandas/io/parsers/base_parser.py | 5 +++-- pandas/io/parsers/c_parser_wrapper.py | 3 ++- pandas/io/parsers/python_parser.py | 3 ++- pandas/io/parsers/readers.py | 6 ++++-- pandas/io/pytables.py | 7 +++++-- pandas/io/sql.py | 7 ++++--- pandas/plotting/_matplotlib/tools.py | 6 ++++-- pandas/tseries/frequencies.py | 3 ++- pandas/util/_validators.py | 4 +++- pandas/util/testing.py | 4 +++- 14 files changed, 47 insertions(+), 28 deletions(-) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index c9f7fd43c1050..05cd3a3a72257 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -11,6 +11,7 @@ ) from pandas._libs.missing import is_matching_na import pandas._libs.testing as _testing +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_bool, @@ -106,7 +107,7 @@ def assert_almost_equal( "is deprecated and will be removed in a future version. " "You can stop passing 'check_less_precise' to silence this warning.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) # https://github.com/python/mypy/issues/7642 # error: Argument 1 to "_get_tol_from_less_precise" has incompatible @@ -340,7 +341,7 @@ def _get_ilevel_values(index, level): "is deprecated and will be removed in a future version. " "You can stop passing 'check_less_precise' to silence this warning.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) # https://github.com/python/mypy/issues/7642 # error: Argument 1 to "_get_tol_from_less_precise" has incompatible @@ -818,7 +819,7 @@ def assert_extension_array_equal( "is deprecated and will be removed in a future version. " "You can stop passing 'check_less_precise' to silence this warning.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) rtol = atol = _get_tol_from_less_precise(check_less_precise) @@ -964,7 +965,7 @@ def assert_series_equal( "is deprecated and will be removed in a future version. " "You can stop passing 'check_less_precise' to silence this warning.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) rtol = atol = _get_tol_from_less_precise(check_less_precise) @@ -1247,7 +1248,7 @@ def assert_frame_equal( "is deprecated and will be removed in a future version. 
" "You can stop passing 'check_less_precise' to silence this warning.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) rtol = atol = _get_tol_from_less_precise(check_less_precise) diff --git a/pandas/io/common.py b/pandas/io/common.py index be6577e646ac3..12c7afc8ee2e4 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -49,6 +49,7 @@ import_lzma, ) from pandas.compat._optional import import_optional_dependency +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import is_file_like @@ -270,7 +271,7 @@ def _get_filepath_or_buffer( warnings.warn( "compression has no effect when passing a non-binary object as input.", RuntimeWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) compression_method = None diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py index f079a25f69fec..ef60afa195234 100644 --- a/pandas/io/date_converters.py +++ b/pandas/io/date_converters.py @@ -4,6 +4,7 @@ import numpy as np from pandas._libs.tslibs import parsing +from pandas.util._exceptions import find_stack_level def parse_date_time(date_col, time_col): @@ -18,7 +19,7 @@ def parse_date_time(date_col, time_col): Use pd.to_datetime(date_col + " " + time_col).to_pydatetime() instead to get a Numpy array. """, # noqa: E501 FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) date_col = _maybe_cast(date_col) time_col = _maybe_cast(time_col) @@ -38,7 +39,7 @@ def parse_date_fields(year_col, month_col, day_col): np.array([s.to_pydatetime() for s in ser]) instead to get a Numpy array. """, # noqa: E501 FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) year_col = _maybe_cast(year_col) @@ -63,7 +64,7 @@ def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, second_ np.array([s.to_pydatetime() for s in ser]) instead to get a Numpy array. """, # noqa: E501 FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) year_col = _maybe_cast(year_col) @@ -89,7 +90,7 @@ def generic_parser(parse_func, *cols): Use pd.to_datetime instead. """, FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) N = _check_columns(cols) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index e543c9161a26e..1caf334f9607e 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -833,7 +833,7 @@ def __new__( warnings.warn( "Use of **kwargs is deprecated, use engine_kwargs instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) # only switch class if generic(ExcelWriter) @@ -868,7 +868,7 @@ def __new__( "deprecated and will also raise a warning, it can " "be globally set and the warning suppressed.", FutureWarning, - stacklevel=4, + stacklevel=find_stack_level(), ) cls = get_writer(engine) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 339585810bec1..6374f52f6964b 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -32,6 +32,7 @@ ParserError, ParserWarning, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( @@ -558,7 +559,7 @@ def _convert_to_ndarrays( f"for column {c} - only the converter will be used." ), ParserWarning, - stacklevel=7, + stacklevel=find_stack_level(), ) try: @@ -830,7 +831,7 @@ def _check_data_length(self, columns: list[str], data: list[ArrayLike]) -> None: "Length of header or names does not match length of data. 
This leads " "to a loss of data with index_col=False.", ParserWarning, - stacklevel=6, + stacklevel=find_stack_level(), ) def _evaluate_usecols(self, usecols, names): diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 352dd998dda0f..db750cded45e5 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -10,6 +10,7 @@ FilePathOrBuffer, ) from pandas.errors import DtypeWarning +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -387,7 +388,7 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: f"Specify dtype option on import or set low_memory=False." ] ) - warnings.warn(warning_message, DtypeWarning, stacklevel=8) + warnings.warn(warning_message, DtypeWarning, stacklevel=find_stack_level()) return result diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index b0e868b260369..4d596aa2f3fa6 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -24,6 +24,7 @@ EmptyDataError, ParserError, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import is_integer from pandas.core.dtypes.inference import is_dict_like @@ -555,7 +556,7 @@ def _handle_usecols( "Defining usecols with out of bounds indices is deprecated " "and will raise a ParserError in a future version.", FutureWarning, - stacklevel=8, + stacklevel=find_stack_level(), ) col_indices = self.usecols diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 6d3cc84a31d05..6fb9497dbc1d6 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1041,7 +1041,7 @@ def _clean_options(self, options, engine): "engine='python'." 
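Background for the ``DtypeWarning`` raised from ``_concatenate_chunks`` above: with ``low_memory=True`` the C parser infers dtypes chunk by chunk, so a column whose values change character late in a large file can come back with mixed dtypes. A hedged sketch of one way to reproduce it (the exact chunking threshold is an implementation detail):

    import io

    import pandas as pd

    # numeric for a couple of million rows, then suddenly textual, so
    # per-chunk inference can disagree between chunks
    csv = "a\n" + "1\n" * 2_000_000 + "x\n"
    pd.read_csv(io.StringIO(csv))                    # may emit DtypeWarning
    pd.read_csv(io.StringIO(csv), dtype={"a": str})  # explicit dtype avoids it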
), ParserWarning, - stacklevel=5, + stacklevel=find_stack_level(), ) index_col = options["index_col"] @@ -1573,7 +1573,9 @@ def _merge_with_dialect_properties( conflict_msgs.append(msg) if conflict_msgs: - warnings.warn("\n\n".join(conflict_msgs), ParserWarning, stacklevel=2) + warnings.warn( + "\n\n".join(conflict_msgs), ParserWarning, stacklevel=find_stack_level() + ) kwds[param] = dialect_val return kwds diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 8c8e9b9feeb80..0e886befb5f2f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -45,6 +45,7 @@ from pandas.compat.pickle_compat import patch_pickle from pandas.errors import PerformanceWarning from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( ensure_object, @@ -2190,7 +2191,9 @@ def update_info(self, info): # frequency/name just warn if key in ["freq", "index_name"]: ws = attribute_conflict_doc % (key, existing_value, value) - warnings.warn(ws, AttributeConflictWarning, stacklevel=6) + warnings.warn( + ws, AttributeConflictWarning, stacklevel=find_stack_level() + ) # reset idx[key] = None @@ -3080,7 +3083,7 @@ def write_array( pass else: ws = performance_doc % (inferred_type, key, items) - warnings.warn(ws, PerformanceWarning, stacklevel=7) + warnings.warn(ws, PerformanceWarning, stacklevel=find_stack_level()) vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom()) vlarr.append(value) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index ec5262ee3a04c..867ce52cbde6f 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -28,6 +28,7 @@ from pandas._typing import DtypeArg from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_datetime64tz_dtype, @@ -1159,7 +1160,7 @@ def _sqlalchemy_type(self, col): "the 'timedelta' type is not supported, and will be " "written as integer values (ns frequency) to the database.", UserWarning, - stacklevel=8, + stacklevel=find_stack_level(), ) return BigInteger elif col_type == "floating": @@ -1886,7 +1887,7 @@ def _create_table_setup(self): pat = re.compile(r"\s+") column_names = [col_name for col_name, _, _ in column_names_and_types] if any(map(pat.search, column_names)): - warnings.warn(_SAFE_NAMES_WARNING, stacklevel=6) + warnings.warn(_SAFE_NAMES_WARNING, stacklevel=find_stack_level()) escape = _get_valid_sqlite_name @@ -1948,7 +1949,7 @@ def _sql_type_name(self, col): "the 'timedelta' type is not supported, and will be " "written as integer values (ns frequency) to the database.", UserWarning, - stacklevel=8, + stacklevel=find_stack_level(), ) col_type = "integer" diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 9679e79d8c4ba..5314a61191d78 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -13,6 +13,8 @@ import matplotlib.ticker as ticker import numpy as np +from pandas.util._exceptions import find_stack_level + from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -233,7 +235,7 @@ def create_subplots( "When passing multiple axes, sharex and sharey " "are ignored. 
These settings must be specified when creating axes.", UserWarning, - stacklevel=4, + stacklevel=find_stack_level(), ) if ax.size == naxes: fig = ax.flat[0].get_figure() @@ -256,7 +258,7 @@ def create_subplots( "To output multiple subplots, the figure containing " "the passed axes is being cleared.", UserWarning, - stacklevel=4, + stacklevel=find_stack_level(), ) fig.clear() diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index c2d7f7b3f716c..fc01771507888 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -29,6 +29,7 @@ from pandas._libs.tslibs.parsing import get_rule_month from pandas._typing import npt from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_datetime64_dtype, @@ -116,7 +117,7 @@ def get_offset(name: str) -> DateOffset: "get_offset is deprecated and will be removed in a future version, " "use to_offset instead.", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return _get_offset(name) diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index f8bd1ec7bc96a..ee54b1b2074cb 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -12,6 +12,8 @@ import numpy as np +from pandas.util._exceptions import find_stack_level + from pandas.core.dtypes.common import ( is_bool, is_integer, @@ -339,7 +341,7 @@ def validate_axis_style_args(data, args, kwargs, arg_name, method_name): "positional arguments for 'index' or 'columns' will raise " "a 'TypeError'." ) - warnings.warn(msg, FutureWarning, stacklevel=4) + warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) out[data._get_axis_name(0)] = args[0] out[data._get_axis_name(1)] = args[1] else: diff --git a/pandas/util/testing.py b/pandas/util/testing.py index af9fe4846b27d..0ab59a202149d 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1,5 +1,7 @@ import warnings +from pandas.util._exceptions import find_stack_level + from pandas._testing import * # noqa warnings.warn( @@ -8,5 +10,5 @@ "public API at pandas.testing instead." 
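All of these call sites import the helper from ``pandas.util._exceptions``; conceptually it walks the call stack until it leaves the pandas package and returns that depth. A simplified sketch of the idea (the real implementation may differ in its details):

    import inspect
    import os

    import pandas as pd

    def find_stack_level() -> int:
        """First stacklevel outside pandas, suitable for warnings.warn."""
        pkg_dir = os.path.dirname(pd.__file__)
        test_dir = os.path.join(pkg_dir, "tests")
        n = 0
        for n, frame_info in enumerate(inspect.stack()):
            fname = frame_info.filename
            # stop at the first frame that is not inside pandas itself
            # (pandas' own tests count as "outside" so warnings surface there)
            if not (fname.startswith(pkg_dir) and not fname.startswith(test_dir)):
                break
        return n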
     ),
     FutureWarning,
-    stacklevel=2,
+    stacklevel=find_stack_level(),
 )

From cda4544c8053555dca151616dcbaa34f821ea7a5 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Sat, 13 Nov 2021 09:05:21 -0800
Subject: [PATCH 32/53] TST: de-duplicate assert_slices_equivalent (#44415)

---
 pandas/_testing/__init__.py                   |  1 +
 pandas/_testing/asserters.py                  | 14 ++++++++
 .../tests/indexing/multiindex/test_slice.py   | 34 +++++++++----------
 pandas/tests/indexing/test_indexing.py        | 22 +++++-------
 pandas/tests/series/indexing/test_indexing.py | 21 ++++--------
 5 files changed, 47 insertions(+), 45 deletions(-)

diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
index c2c55a4060f7a..4f9ef2c3c3ffa 100644
--- a/pandas/_testing/__init__.py
+++ b/pandas/_testing/__init__.py
@@ -82,6 +82,7 @@
     assert_extension_array_equal,
     assert_frame_equal,
     assert_index_equal,
+    assert_indexing_slices_equivalent,
     assert_interval_array_equal,
     assert_is_sorted,
     assert_is_valid_plot_return_object,
diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
index 05cd3a3a72257..54f74bd1ae107 100644
--- a/pandas/_testing/asserters.py
+++ b/pandas/_testing/asserters.py
@@ -1445,3 +1445,17 @@ def is_extension_array_dtype_and_needs_i8_conversion(left_dtype, right_dtype) ->
     Related to issue #37609
     """
     return is_extension_array_dtype(left_dtype) and needs_i8_conversion(right_dtype)
+
+
+def assert_indexing_slices_equivalent(ser: Series, l_slc: slice, i_slc: slice):
+    """
+    Check that ser.iloc[i_slc] matches ser.loc[l_slc] and, if applicable,
+    ser[l_slc].
+    """
+    expected = ser.iloc[i_slc]
+
+    assert_series_equal(ser.loc[l_slc], expected)
+
+    if not ser.index.is_integer():
+        # For integer indices, .loc and plain getitem are position-based.
+        assert_series_equal(ser[l_slc], expected)
diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py
index 42edaa2fe6c3a..55d45a21d643a 100644
--- a/pandas/tests/indexing/multiindex/test_slice.py
+++ b/pandas/tests/indexing/multiindex/test_slice.py
@@ -702,32 +702,30 @@ def test_per_axis_per_level_setitem(self):
         tm.assert_frame_equal(df, expected)

     def test_multiindex_label_slicing_with_negative_step(self):
-        s = Series(
+        ser = Series(
             np.arange(20), MultiIndex.from_product([list("abcde"), np.arange(4)])
         )
         SLC = pd.IndexSlice

-        def assert_slices_equivalent(l_slc, i_slc):
-            tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc])
-            tm.assert_series_equal(s[l_slc], s.iloc[i_slc])
+        tm.assert_indexing_slices_equivalent(ser, SLC[::-1], SLC[::-1])

-        assert_slices_equivalent(SLC[::-1], SLC[::-1])
+        tm.assert_indexing_slices_equivalent(ser, SLC["d"::-1], SLC[15::-1])
+        tm.assert_indexing_slices_equivalent(ser, SLC[("d",)::-1], SLC[15::-1])

-        assert_slices_equivalent(SLC["d"::-1], SLC[15::-1])
-        assert_slices_equivalent(SLC[("d",)::-1], SLC[15::-1])
+        tm.assert_indexing_slices_equivalent(ser, SLC[:"d":-1], SLC[:11:-1])
+        tm.assert_indexing_slices_equivalent(ser, SLC[:("d",):-1], SLC[:11:-1])

-        assert_slices_equivalent(SLC[:"d":-1], SLC[:11:-1])
-        assert_slices_equivalent(SLC[:("d",):-1], SLC[:11:-1])
+        tm.assert_indexing_slices_equivalent(ser, SLC["d":"b":-1], SLC[15:3:-1])
+        tm.assert_indexing_slices_equivalent(ser, SLC[("d",):"b":-1], SLC[15:3:-1])
+        tm.assert_indexing_slices_equivalent(ser, SLC["d":("b",):-1], SLC[15:3:-1])
+        tm.assert_indexing_slices_equivalent(ser, SLC[("d",):("b",):-1], SLC[15:3:-1])
+        tm.assert_indexing_slices_equivalent(ser, SLC["b":"d":-1], SLC[:0])

-        assert_slices_equivalent(SLC["d":"b":-1], SLC[15:3:-1])
-
assert_slices_equivalent(SLC[("d",):"b":-1], SLC[15:3:-1]) - assert_slices_equivalent(SLC["d":("b",):-1], SLC[15:3:-1]) - assert_slices_equivalent(SLC[("d",):("b",):-1], SLC[15:3:-1]) - assert_slices_equivalent(SLC["b":"d":-1], SLC[:0]) - - assert_slices_equivalent(SLC[("c", 2)::-1], SLC[10::-1]) - assert_slices_equivalent(SLC[:("c", 2):-1], SLC[:9:-1]) - assert_slices_equivalent(SLC[("e", 0):("c", 2):-1], SLC[16:9:-1]) + tm.assert_indexing_slices_equivalent(ser, SLC[("c", 2)::-1], SLC[10::-1]) + tm.assert_indexing_slices_equivalent(ser, SLC[:("c", 2):-1], SLC[:9:-1]) + tm.assert_indexing_slices_equivalent( + ser, SLC[("e", 0):("c", 2):-1], SLC[16:9:-1] + ) def test_multiindex_slice_first_level(self): # GH 12697 diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 7c7e9f79a77ae..2805c8877ed78 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -709,21 +709,17 @@ def run_tests(df, rhs, right_loc, right_iloc): def test_str_label_slicing_with_negative_step(self): SLC = pd.IndexSlice - def assert_slices_equivalent(l_slc, i_slc): - tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) - - if not idx.is_integer: - # For integer indices, .loc and plain getitem are position-based. - tm.assert_series_equal(s[l_slc], s.iloc[i_slc]) - tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) - for idx in [_mklbl("A", 20), np.arange(20) + 100, np.linspace(100, 150, 20)]: idx = Index(idx) - s = Series(np.arange(20), index=idx) - assert_slices_equivalent(SLC[idx[9] :: -1], SLC[9::-1]) - assert_slices_equivalent(SLC[: idx[9] : -1], SLC[:8:-1]) - assert_slices_equivalent(SLC[idx[13] : idx[9] : -1], SLC[13:8:-1]) - assert_slices_equivalent(SLC[idx[9] : idx[13] : -1], SLC[:0]) + ser = Series(np.arange(20), index=idx) + tm.assert_indexing_slices_equivalent(ser, SLC[idx[9] :: -1], SLC[9::-1]) + tm.assert_indexing_slices_equivalent(ser, SLC[: idx[9] : -1], SLC[:8:-1]) + tm.assert_indexing_slices_equivalent( + ser, SLC[idx[13] : idx[9] : -1], SLC[13:8:-1] + ) + tm.assert_indexing_slices_equivalent( + ser, SLC[idx[9] : idx[13] : -1], SLC[:0] + ) def test_slice_with_zero_step_raises(self, indexer_sl, frame_or_series): obj = frame_or_series(np.arange(20), index=_mklbl("A", 20)) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 6c3587c7eeada..8a34882b1e5d4 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -338,26 +338,19 @@ def test_slice_with_zero_step_raises(index, frame_or_series, indexer_sli): ], ) def test_slice_with_negative_step(index): - def assert_slices_equivalent(l_slc, i_slc): - expected = ts.iloc[i_slc] - - tm.assert_series_equal(ts[l_slc], expected) - tm.assert_series_equal(ts.loc[l_slc], expected) - keystr1 = str(index[9]) keystr2 = str(index[13]) - box = type(index[0]) - ts = Series(np.arange(20), index) + ser = Series(np.arange(20), index) SLC = IndexSlice - for key in [keystr1, box(keystr1)]: - assert_slices_equivalent(SLC[key::-1], SLC[9::-1]) - assert_slices_equivalent(SLC[:key:-1], SLC[:8:-1]) + for key in [keystr1, index[9]]: + tm.assert_indexing_slices_equivalent(ser, SLC[key::-1], SLC[9::-1]) + tm.assert_indexing_slices_equivalent(ser, SLC[:key:-1], SLC[:8:-1]) - for key2 in [keystr2, box(keystr2)]: - assert_slices_equivalent(SLC[key2:key:-1], SLC[13:8:-1]) - assert_slices_equivalent(SLC[key:key2:-1], SLC[0:0:-1]) + for key2 in [keystr2, index[13]]: + 
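+        # the stringified label and the raw label must resolve to the
+        # same positional slice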
tm.assert_indexing_slices_equivalent(ser, SLC[key2:key:-1], SLC[13:8:-1]) + tm.assert_indexing_slices_equivalent(ser, SLC[key:key2:-1], SLC[0:0:-1]) def test_tuple_index(): From f537c134994909103e5699466fe2acb5f16e2d64 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 13 Nov 2021 18:05:35 +0100 Subject: [PATCH 33/53] DOC: Add how=cross description to join (#44418) --- pandas/core/frame.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b01de5dec610d..212bb63693d56 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9155,6 +9155,11 @@ def join( * inner: form intersection of calling frame's index (or column if on is specified) with `other`'s index, preserving the order of the calling's one. + * cross: creates the cartesian product from both frames, preserves the order + of the left keys. + + .. versionadded:: 1.2.0 + lsuffix : str, default '' Suffix to use from left frame's overlapping columns. rsuffix : str, default '' From a5700de634966f3e9a99914a0767a00fd3003414 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sat, 13 Nov 2021 12:21:03 -0500 Subject: [PATCH 34/53] DOC: whatsnew for the improvement to warning messages (#44419) --- doc/source/whatsnew/v1.4.0.rst | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 560c3fad59e5e..3834a089aea53 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -15,6 +15,31 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +.. _whatsnew_140.enhancements.warning_lineno: + +Improved warning messages +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, warning messages may have pointed to lines within the pandas library. Running the script ``setting_with_copy_warning.py`` + +.. code-block:: python + + import pandas as pd + + df = pd.DataFrame({'a': [1, 2, 3]}) + df[:2].loc[:, 'a'] = 5 + +with pandas 1.3 resulted in:: + + .../site-packages/pandas/core/indexing.py:1951: SettingWithCopyWarning: + A value is trying to be set on a copy of a slice from a DataFrame. + +This made it difficult to determine where the warning was being generated from. Now pandas will inspect the call stack, reporting the first line outside of the pandas library that gave rise to the warning. The output of the above script is now:: + + setting_with_copy_warning.py:4: SettingWithCopyWarning: + A value is trying to be set on a copy of a slice from a DataFrame. + + .. 
_whatsnew_140.enhancements.numeric_index: More flexible numeric dtypes for indexes From 6d348550ae14eceab2a92dadbfd14feb2acc6567 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 13 Nov 2021 09:21:40 -0800 Subject: [PATCH 35/53] BUG: handle NaNs in FloatingArray.equals (#44390) --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/arrays/masked.py | 15 ++++++++++ .../tests/arrays/floating/test_comparison.py | 29 +++++++++++++++++++ 3 files changed, 45 insertions(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 3834a089aea53..59b164c156d79 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -719,6 +719,7 @@ Other - Bug in :meth:`RangeIndex.difference` with ``sort=None`` and ``step<0`` failing to sort (:issue:`44085`) - Bug in :meth:`Series.to_frame` and :meth:`Index.to_frame` ignoring the ``name`` argument when ``name=None`` is explicitly passed (:issue:`44212`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` with ``value=None`` and ExtensionDtypes (:issue:`44270`) +- Bug in :meth:`FloatingArray.equals` failing to consider two arrays equal if they contain ``np.nan`` values (:issue:`44382`) - .. ***DO NOT USE THIS SECTION*** diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index b11b11ded2f22..1797f1aff4235 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -627,6 +627,21 @@ def value_counts(self, dropna: bool = True) -> Series: return Series(counts, index=index) + @doc(ExtensionArray.equals) + def equals(self, other) -> bool: + if type(self) != type(other): + return False + if other.dtype != self.dtype: + return False + + # GH#44382 if e.g. self[1] is np.nan and other[1] is pd.NA, we are NOT + # equal. + return np.array_equal(self._mask, other._mask) and np.array_equal( + self._data[~self._mask], + other._data[~other._mask], + equal_nan=True, + ) + def _reduce(self, name: str, *, skipna: bool = True, **kwargs): if name in {"any", "all"}: return getattr(self, name)(skipna=skipna, **kwargs) diff --git a/pandas/tests/arrays/floating/test_comparison.py b/pandas/tests/arrays/floating/test_comparison.py index c4163c25ae74d..a429649f1ce1d 100644 --- a/pandas/tests/arrays/floating/test_comparison.py +++ b/pandas/tests/arrays/floating/test_comparison.py @@ -1,7 +1,9 @@ +import numpy as np import pytest import pandas as pd import pandas._testing as tm +from pandas.core.arrays import FloatingArray from pandas.tests.arrays.masked_shared import ( ComparisonOps, NumericOps, @@ -34,3 +36,30 @@ def test_equals(): a1 = pd.array([1, 2, None], dtype="Float64") a2 = pd.array([1, 2, None], dtype="Float32") assert a1.equals(a2) is False + + +def test_equals_nan_vs_na(): + # GH#44382 + + mask = np.zeros(3, dtype=bool) + data = np.array([1.0, np.nan, 3.0], dtype=np.float64) + + left = FloatingArray(data, mask) + assert left.equals(left) + tm.assert_extension_array_equal(left, left) + + assert left.equals(left.copy()) + assert left.equals(FloatingArray(data.copy(), mask.copy())) + + mask2 = np.array([False, True, False], dtype=bool) + data2 = np.array([1.0, 2.0, 3.0], dtype=np.float64) + right = FloatingArray(data2, mask2) + assert right.equals(right) + tm.assert_extension_array_equal(right, right) + + assert not left.equals(right) + + # with mask[1] = True, the only difference is data[1], which should + # not matter for equals + mask[1] = True + assert left.equals(right) From c456969370aaec9c6c43269f3c4ff2a5c9810efa Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 13 Nov 
2021 15:26:01 -0800 Subject: [PATCH 36/53] Fix FloatingArray.equals on older numpy (#44432) --- pandas/core/arrays/masked.py | 12 +++++++----- pandas/core/dtypes/missing.py | 4 ++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 1797f1aff4235..568f3484e78e4 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -47,6 +47,7 @@ ) from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import ( + array_equivalent, isna, notna, ) @@ -636,11 +637,12 @@ def equals(self, other) -> bool: # GH#44382 if e.g. self[1] is np.nan and other[1] is pd.NA, we are NOT # equal. - return np.array_equal(self._mask, other._mask) and np.array_equal( - self._data[~self._mask], - other._data[~other._mask], - equal_nan=True, - ) + if not np.array_equal(self._mask, other._mask): + return False + + left = self._data[~self._mask] + right = other._data[~other._mask] + return array_equivalent(left, right, dtype_equal=True) def _reduce(self, name: str, *, skipna: bool = True, **kwargs): if name in {"any", "all"}: diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index c457b52cf4b0e..eea3fa37b7435 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -475,8 +475,8 @@ def array_equivalent( return np.array_equal(left, right) -def _array_equivalent_float(left, right): - return ((left == right) | (np.isnan(left) & np.isnan(right))).all() +def _array_equivalent_float(left, right) -> bool: + return bool(((left == right) | (np.isnan(left) & np.isnan(right))).all()) def _array_equivalent_datetimelike(left, right): From 7483ee9435082bb2e255b3acbe47550dd315db7a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 13 Nov 2021 18:05:12 -0800 Subject: [PATCH 37/53] BUG: DataFrame with mismatched NA value and dtype (#44428) --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/internals/construction.py | 17 ++++++++++------- pandas/tests/frame/test_constructors.py | 9 +-------- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 59b164c156d79..92fadf801cec7 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -608,6 +608,7 @@ Missing ^^^^^^^ - Bug in :meth:`DataFrame.fillna` with limit and no method ignores axis='columns' or ``axis = 1`` (:issue:`40989`) - Bug in :meth:`DataFrame.fillna` not replacing missing values when using a dict-like ``value`` and duplicate column names (:issue:`43476`) +- Bug in constructing a :class:`DataFrame` with a dictionary ``np.datetime64`` as a value and ``dtype='timedelta64[ns]'``, or vice-versa, incorrectly casting instead of raising (:issue:`??`) - MultiIndex diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index e6d6b561803d6..a766f8321a641 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -443,15 +443,18 @@ def dict_to_mgr( if missing.any() and not is_integer_dtype(dtype): nan_dtype: DtypeObj - if dtype is None or ( - isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.flexible) - ): + if dtype is not None: + # calling sanitize_array ensures we don't mix-and-match + # NA dtypes + midxs = missing.values.nonzero()[0] + for i in midxs: + arr = sanitize_array(arrays.iat[i], index, dtype=dtype) + arrays.iat[i] = arr + else: # GH#1783 nan_dtype = np.dtype("object") - else: - nan_dtype = dtype - val = 
construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype) - arrays.loc[missing] = [val] * missing.sum() + val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype) + arrays.loc[missing] = [val] * missing.sum() arrays = list(arrays) columns = ensure_index(columns) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index f92bbe1c718ab..52797862afa14 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2903,14 +2903,7 @@ def test_from_timedelta64_scalar_object(self, constructor): assert isinstance(get1(obj), np.timedelta64) @pytest.mark.parametrize("cls", [np.datetime64, np.timedelta64]) - def test_from_scalar_datetimelike_mismatched(self, constructor, cls, request): - node = request.node - params = node.callspec.params - if params["frame_or_series"] is DataFrame and params["constructor"] is dict: - mark = pytest.mark.xfail( - reason="DataFrame incorrectly allows mismatched datetimelike" - ) - node.add_marker(mark) + def test_from_scalar_datetimelike_mismatched(self, constructor, cls): scalar = cls("NaT", "ns") dtype = {np.datetime64: "m8[ns]", np.timedelta64: "M8[ns]"}[cls] From 25401fca34c5faf3b8ab0770990fa8c9bf3a9885 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 13 Nov 2021 18:05:39 -0800 Subject: [PATCH 38/53] TST: collect/share Index tests (#44413) --- pandas/tests/frame/indexing/test_setitem.py | 13 ++ .../tests/indexes/base_class/test_formats.py | 14 ++ pandas/tests/indexes/common.py | 60 -------- .../tests/indexes/datetimes/test_formats.py | 17 +++ .../tests/indexes/datetimes/test_indexing.py | 63 ++------- .../tests/indexes/interval/test_indexing.py | 35 +++++ .../tests/indexes/interval/test_interval.py | 37 ----- pandas/tests/indexes/interval/test_pickle.py | 13 ++ pandas/tests/indexes/multi/test_compat.py | 7 - pandas/tests/indexes/multi/test_pickle.py | 10 ++ pandas/tests/indexes/period/test_indexing.py | 8 -- pandas/tests/indexes/test_any_index.py | 6 + pandas/tests/indexes/test_base.py | 131 +----------------- pandas/tests/indexes/test_common.py | 64 ++++++++- pandas/tests/indexes/test_index_new.py | 92 ++++++++++++ .../tests/indexes/timedeltas/test_indexing.py | 8 -- pandas/tests/indexing/test_datetime.py | 15 +- pandas/tests/indexing/test_indexing.py | 26 ++-- pandas/tests/indexing/test_scalar.py | 7 + pandas/tests/series/indexing/test_getitem.py | 6 + 20 files changed, 301 insertions(+), 331 deletions(-) create mode 100644 pandas/tests/indexes/interval/test_pickle.py create mode 100644 pandas/tests/indexes/multi/test_pickle.py diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 389bf56ab6035..bb1a1bc72116d 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -44,6 +44,19 @@ class TestDataFrameSetItem: + def test_setitem_str_subclass(self): + # GH#37366 + class mystring(str): + pass + + data = ["2020-10-22 01:21:00+00:00"] + index = DatetimeIndex(data) + df = DataFrame({"a": [1]}, index=index) + df["b"] = 2 + df[mystring("c")] = 3 + expected = DataFrame({"a": [1], "b": [2], mystring("c"): [3]}, index=index) + tm.assert_equal(df, expected) + @pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"]) def test_setitem_dtype(self, dtype, float_frame): arr = np.random.randn(len(float_frame)) diff --git a/pandas/tests/indexes/base_class/test_formats.py b/pandas/tests/indexes/base_class/test_formats.py index 
f07b06acbfbdb..9053d45dee623 100644 --- a/pandas/tests/indexes/base_class/test_formats.py +++ b/pandas/tests/indexes/base_class/test_formats.py @@ -122,6 +122,14 @@ def test_repr_summary(self): assert len(result) < 200 assert "..." in result + def test_summary_bug(self): + # GH#3869 + ind = Index(["{other}%s", "~:{range}:0"], name="A") + result = ind._summary() + # shouldn't be formatted accidentally. + assert "~:{range}:0" in result + assert "{other}%s" in result + def test_index_repr_bool_nan(self): # GH32146 arr = Index([True, False, np.nan], dtype=object) @@ -132,3 +140,9 @@ def test_index_repr_bool_nan(self): exp2 = repr(arr) out2 = "Index([True, False, nan], dtype='object')" assert out2 == exp2 + + def test_format_different_scalar_lengths(self): + # GH#35439 + idx = Index(["aaaaaaaaa", "b"]) + expected = ["aaaaaaaaa", "b"] + assert idx.format() == expected diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 33d2558613baf..a5ee743b5cd9a 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -69,26 +69,6 @@ def test_pickle_compat_construction(self): with pytest.raises(TypeError, match=msg): self._index_cls() - @pytest.mark.parametrize("name", [None, "new_name"]) - def test_to_frame(self, name, simple_index): - # see GH-15230, GH-22580 - idx = simple_index - - if name: - idx_name = name - else: - idx_name = idx.name or 0 - - df = idx.to_frame(name=idx_name) - - assert df.index is idx - assert len(df.columns) == 1 - assert df.columns[0] == idx_name - assert df[idx_name].values is not idx.values - - df = idx.to_frame(index=False, name=idx_name) - assert df.index is not idx - def test_shift(self, simple_index): # GH8083 test the base class for shift @@ -226,46 +206,6 @@ def test_repr_max_seq_item_setting(self, simple_index): repr(idx) assert "..." not in str(idx) - def test_copy_name(self, index): - # gh-12309: Check that the "name" argument - # passed at initialization is honored. - if isinstance(index, MultiIndex): - return - - first = type(index)(index, copy=True, name="mario") - second = type(first)(first, copy=False) - - # Even though "copy=False", we want a new object. - assert first is not second - - # Not using tm.assert_index_equal() since names differ. 
- assert index.equals(first) - - assert first.name == "mario" - assert second.name == "mario" - - s1 = Series(2, index=first) - s2 = Series(3, index=second[:-1]) - - if not isinstance(index, CategoricalIndex): - # See gh-13365 - s3 = s1 * s2 - assert s3.index.name == "mario" - - def test_copy_name2(self, index): - # gh-35592 - if isinstance(index, MultiIndex): - return - - assert index.copy(name="mario").name == "mario" - - with pytest.raises(ValueError, match="Length of new names must be 1, got 2"): - index.copy(name=["mario", "luigi"]) - - msg = f"{type(index).__name__}.name must be a hashable type" - with pytest.raises(TypeError, match=msg): - index.copy(name=[["mario"]]) - def test_ensure_copied_data(self, index): # Check the "copy" argument of each Index.__new__ is honoured # GH12309 diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index 36046aaeacaae..197038dbadaf7 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -254,3 +254,20 @@ def test_dti_custom_business_summary_dateutil(self): pd.bdate_range( "1/1/2005", "1/1/2009", freq="C", tz=dateutil.tz.tzutc() )._summary() + + +class TestFormat: + def test_format_with_name_time_info(self): + # bug I fixed 12/20/2011 + dates = pd.date_range("2011-01-01 04:00:00", periods=10, name="something") + + formatted = dates.format(name=True) + assert formatted[0] == "something" + + def test_format_datetime_with_time(self): + dti = DatetimeIndex([datetime(2012, 2, 7), datetime(2012, 2, 7, 23)]) + + result = dti.format() + expected = ["2012-02-07 00:00:00", "2012-02-07 23:00:00"] + assert len(result) == 2 + assert result == expected diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index c3152b77d39df..beca71969dfcd 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -21,25 +21,12 @@ ) import pandas._testing as tm -from pandas.tseries.offsets import ( - BDay, - CDay, -) +from pandas.tseries.frequencies import to_offset START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) class TestGetItem: - def test_ellipsis(self): - # GH#21282 - idx = date_range( - "2011-01-01", "2011-01-31", freq="D", tz="Asia/Tokyo", name="idx" - ) - - result = idx[...] - assert result.equals(idx) - assert result is not idx - def test_getitem_slice_keeps_name(self): # GH4226 st = Timestamp("2013-07-01 00:00:00", tz="America/Los_Angeles") @@ -88,44 +75,17 @@ def test_getitem(self): tm.assert_index_equal(result, expected) assert result.freq == expected.freq - def test_dti_business_getitem(self): - rng = bdate_range(START, END) - smaller = rng[:5] - exp = DatetimeIndex(rng.view(np.ndarray)[:5], freq="B") - tm.assert_index_equal(smaller, exp) - assert smaller.freq == exp.freq - - assert smaller.freq == rng.freq - - sliced = rng[::5] - assert sliced.freq == BDay() * 5 - - fancy_indexed = rng[[4, 3, 2, 1, 0]] - assert len(fancy_indexed) == 5 - assert isinstance(fancy_indexed, DatetimeIndex) - assert fancy_indexed.freq is None - - # 32-bit vs. 
64-bit platforms - assert rng[4] == rng[np.int_(4)] - - def test_dti_business_getitem_matplotlib_hackaround(self): - rng = bdate_range(START, END) - with tm.assert_produces_warning(FutureWarning): - # GH#30588 multi-dimensional indexing deprecated - values = rng[:, None] - expected = rng.values[:, None] - tm.assert_numpy_array_equal(values, expected) - - def test_dti_custom_getitem(self): - rng = bdate_range(START, END, freq="C") + @pytest.mark.parametrize("freq", ["B", "C"]) + def test_dti_business_getitem(self, freq): + rng = bdate_range(START, END, freq=freq) smaller = rng[:5] - exp = DatetimeIndex(rng.view(np.ndarray)[:5], freq="C") + exp = DatetimeIndex(rng.view(np.ndarray)[:5], freq=freq) tm.assert_index_equal(smaller, exp) assert smaller.freq == exp.freq assert smaller.freq == rng.freq sliced = rng[::5] - assert sliced.freq == CDay() * 5 + assert sliced.freq == to_offset(freq) * 5 fancy_indexed = rng[[4, 3, 2, 1, 0]] assert len(fancy_indexed) == 5 @@ -135,8 +95,9 @@ def test_dti_custom_getitem(self): # 32-bit vs. 64-bit platforms assert rng[4] == rng[np.int_(4)] - def test_dti_custom_getitem_matplotlib_hackaround(self): - rng = bdate_range(START, END, freq="C") + @pytest.mark.parametrize("freq", ["B", "C"]) + def test_dti_business_getitem_matplotlib_hackaround(self, freq): + rng = bdate_range(START, END, freq=freq) with tm.assert_produces_warning(FutureWarning): # GH#30588 multi-dimensional indexing deprecated values = rng[:, None] @@ -255,6 +216,12 @@ def test_where_tz(self): class TestTake: + def test_take_nan_first_datetime(self): + index = DatetimeIndex([pd.NaT, Timestamp("20130101"), Timestamp("20130102")]) + result = index.take([-1, 0, 1]) + expected = DatetimeIndex([index[-1], index[0], index[1]]) + tm.assert_index_equal(result, expected) + def test_take(self): # GH#10295 idx1 = date_range("2011-01-01", "2011-01-31", freq="D", name="idx") diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index 8df8eef69e9c9..f12f32724b9e1 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -11,6 +11,7 @@ Interval, IntervalIndex, NaT, + Series, Timedelta, date_range, timedelta_range, @@ -523,3 +524,37 @@ def test_putmask_td64(self): result = idx.putmask(mask, idx[-1]) expected = IntervalIndex([idx[-1]] * 3 + list(idx[3:])) tm.assert_index_equal(result, expected) + + +class TestGetValue: + @pytest.mark.parametrize("key", [[5], (2, 3)]) + def test_get_value_non_scalar_errors(self, key): + # GH#31117 + idx = IntervalIndex.from_tuples([(1, 3), (2, 4), (3, 5), (7, 10), (3, 10)]) + ser = Series(range(len(idx)), index=idx) + + msg = str(key) + with pytest.raises(InvalidIndexError, match=msg): + with tm.assert_produces_warning(FutureWarning): + idx.get_value(ser, key) + + +class TestContains: + # .__contains__, not .contains + + def test_contains_dunder(self): + + index = IntervalIndex.from_arrays([0, 1], [1, 2], closed="right") + + # __contains__ requires perfect matches to intervals. 
+ assert 0 not in index + assert 1 not in index + assert 2 not in index + + assert Interval(0, 1, closed="right") in index + assert Interval(0, 2, closed="right") not in index + assert Interval(0, 0.5, closed="right") not in index + assert Interval(3, 5, closed="right") not in index + assert Interval(-1, 0, closed="left") not in index + assert Interval(0, 1, closed="left") not in index + assert Interval(0, 1, closed="both") not in index diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 321d1aa34b9af..843885832690f 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas.errors import InvalidIndexError - import pandas as pd from pandas import ( Index, @@ -500,23 +498,6 @@ def test_contains_method(self): ): i.contains(Interval(0, 1)) - def test_contains_dunder(self): - - index = IntervalIndex.from_arrays([0, 1], [1, 2], closed="right") - - # __contains__ requires perfect matches to intervals. - assert 0 not in index - assert 1 not in index - assert 2 not in index - - assert Interval(0, 1, closed="right") in index - assert Interval(0, 2, closed="right") not in index - assert Interval(0, 0.5, closed="right") not in index - assert Interval(3, 5, closed="right") not in index - assert Interval(-1, 0, closed="left") not in index - assert Interval(0, 1, closed="left") not in index - assert Interval(0, 1, closed="both") not in index - def test_dropna(self, closed): expected = IntervalIndex.from_tuples([(0.0, 1.0), (1.0, 2.0)], closed=closed) @@ -908,24 +889,6 @@ def test_is_all_dates(self): year_2017_index = IntervalIndex([year_2017]) assert not year_2017_index._is_all_dates - @pytest.mark.parametrize("key", [[5], (2, 3)]) - def test_get_value_non_scalar_errors(self, key): - # GH 31117 - idx = IntervalIndex.from_tuples([(1, 3), (2, 4), (3, 5), (7, 10), (3, 10)]) - s = pd.Series(range(len(idx)), index=idx) - - msg = str(key) - with pytest.raises(InvalidIndexError, match=msg): - with tm.assert_produces_warning(FutureWarning): - idx.get_value(s, key) - - @pytest.mark.parametrize("closed", ["left", "right", "both"]) - def test_pickle_round_trip_closed(self, closed): - # https://github.com/pandas-dev/pandas/issues/35658 - idx = IntervalIndex.from_tuples([(1, 2), (2, 3)], closed=closed) - result = tm.round_trip_pickle(idx) - tm.assert_index_equal(result, idx) - def test_dir(): # GH#27571 dir(interval_index) should not raise diff --git a/pandas/tests/indexes/interval/test_pickle.py b/pandas/tests/indexes/interval/test_pickle.py new file mode 100644 index 0000000000000..308a90e72eab5 --- /dev/null +++ b/pandas/tests/indexes/interval/test_pickle.py @@ -0,0 +1,13 @@ +import pytest + +from pandas import IntervalIndex +import pandas._testing as tm + + +class TestPickle: + @pytest.mark.parametrize("closed", ["left", "right", "both"]) + def test_pickle_round_trip_closed(self, closed): + # https://github.com/pandas-dev/pandas/issues/35658 + idx = IntervalIndex.from_tuples([(1, 2), (2, 3)], closed=closed) + result = tm.round_trip_pickle(idx) + tm.assert_index_equal(result, idx) diff --git a/pandas/tests/indexes/multi/test_compat.py b/pandas/tests/indexes/multi/test_compat.py index d2b5a595b8454..cbb4ae0b0d09b 100644 --- a/pandas/tests/indexes/multi/test_compat.py +++ b/pandas/tests/indexes/multi/test_compat.py @@ -96,10 +96,3 @@ def test_inplace_mutation_resets_values(): assert "_values" not in mi2._cache 
tm.assert_almost_equal(mi2.values, new_values) assert "_values" in mi2._cache - - -def test_pickle_compat_construction(): - # this is testing for pickle compat - # need an object to create with - with pytest.raises(TypeError, match="Must pass both levels and codes"): - MultiIndex() diff --git a/pandas/tests/indexes/multi/test_pickle.py b/pandas/tests/indexes/multi/test_pickle.py new file mode 100644 index 0000000000000..1d8b721404421 --- /dev/null +++ b/pandas/tests/indexes/multi/test_pickle.py @@ -0,0 +1,10 @@ +import pytest + +from pandas import MultiIndex + + +def test_pickle_compat_construction(): + # this is testing for pickle compat + # need an object to create with + with pytest.raises(TypeError, match="Must pass both levels and codes"): + MultiIndex() diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 1b5e64bca03a0..df2f114e73df2 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -52,14 +52,6 @@ def non_comparable_idx(request): class TestGetItem: - def test_ellipsis(self): - # GH#21282 - idx = period_range("2011-01-01", "2011-01-31", freq="D", name="idx") - - result = idx[...] - assert result.equals(idx) - assert result is not idx - def test_getitem_slice_keeps_name(self): idx = period_range("20010101", periods=10, freq="D", name="bob") assert idx.name == idx[1:].name diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py index f7dafd78a801f..91679959e7979 100644 --- a/pandas/tests/indexes/test_any_index.py +++ b/pandas/tests/indexes/test_any_index.py @@ -137,6 +137,12 @@ def test_pickle_preserves_name(self, index): class TestIndexing: + def test_getitem_ellipsis(self, index): + # GH#21282 + result = index[...] 
+ assert result.equals(index) + assert result is not index + def test_slice_keeps_name(self, index): assert index.name == index[1:].name diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 7f9a5c0b50595..59ec66ecc1fe9 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1,8 +1,5 @@ from collections import defaultdict -from datetime import ( - datetime, - timedelta, -) +from datetime import datetime from io import StringIO import math import re @@ -10,10 +7,7 @@ import numpy as np import pytest -from pandas.compat import ( - IS64, - np_datetime64_compat, -) +from pandas.compat import IS64 from pandas.util._test_decorators import async_mark import pandas as pd @@ -27,7 +21,6 @@ RangeIndex, Series, TimedeltaIndex, - Timestamp, date_range, period_range, ) @@ -219,91 +212,6 @@ def test_constructor_simple_new(self, vals, dtype): result = index._simple_new(index.values, dtype) tm.assert_index_equal(result, index) - @pytest.mark.parametrize( - "vals", - [ - [1, 2, 3], - np.array([1, 2, 3]), - np.array([1, 2, 3], dtype=int), - # below should coerce - [1.0, 2.0, 3.0], - np.array([1.0, 2.0, 3.0], dtype=float), - ], - ) - def test_constructor_dtypes_to_int64(self, vals): - index = Index(vals, dtype=int) - assert isinstance(index, Int64Index) - - @pytest.mark.parametrize( - "vals", - [ - [1, 2, 3], - [1.0, 2.0, 3.0], - np.array([1.0, 2.0, 3.0]), - np.array([1, 2, 3], dtype=int), - np.array([1.0, 2.0, 3.0], dtype=float), - ], - ) - def test_constructor_dtypes_to_float64(self, vals): - index = Index(vals, dtype=float) - assert isinstance(index, Float64Index) - - @pytest.mark.parametrize( - "vals", - [ - [1, 2, 3], - np.array([1, 2, 3], dtype=int), - np.array( - [np_datetime64_compat("2011-01-01"), np_datetime64_compat("2011-01-02")] - ), - [datetime(2011, 1, 1), datetime(2011, 1, 2)], - ], - ) - def test_constructor_dtypes_to_categorical(self, vals): - index = Index(vals, dtype="category") - assert isinstance(index, CategoricalIndex) - - @pytest.mark.parametrize("cast_index", [True, False]) - @pytest.mark.parametrize( - "vals", - [ - Index( - np.array( - [ - np_datetime64_compat("2011-01-01"), - np_datetime64_compat("2011-01-02"), - ] - ) - ), - Index([datetime(2011, 1, 1), datetime(2011, 1, 2)]), - ], - ) - def test_constructor_dtypes_to_datetime(self, cast_index, vals): - if cast_index: - index = Index(vals, dtype=object) - assert isinstance(index, Index) - assert index.dtype == object - else: - index = Index(vals) - assert isinstance(index, DatetimeIndex) - - @pytest.mark.parametrize("cast_index", [True, False]) - @pytest.mark.parametrize( - "vals", - [ - np.array([np.timedelta64(1, "D"), np.timedelta64(1, "D")]), - [timedelta(1), timedelta(1)], - ], - ) - def test_constructor_dtypes_to_timedelta(self, cast_index, vals): - if cast_index: - index = Index(vals, dtype=object) - assert isinstance(index, Index) - assert index.dtype == object - else: - index = Index(vals) - assert isinstance(index, TimedeltaIndex) - @pytest.mark.filterwarnings("ignore:Passing keywords other:FutureWarning") @pytest.mark.parametrize("attr", ["values", "asi8"]) @pytest.mark.parametrize("klass", [Index, DatetimeIndex]) @@ -726,20 +634,6 @@ def test_is_all_dates(self, index, expected): def test_summary(self, index): index._summary() - def test_summary_bug(self): - # GH3869` - ind = Index(["{other}%s", "~:{range}:0"], name="A") - result = ind._summary() - # shouldn't be formatted accidentally. 
- assert "~:{range}:0" in result - assert "{other}%s" in result - - def test_format_different_scalar_lengths(self): - # GH35439 - idx = Index(["aaaaaaaaa", "b"]) - expected = ["aaaaaaaaa", "b"] - assert idx.format() == expected - def test_format_bug(self): # GH 14626 # windows has different precision on datetime.datetime.now (it doesn't @@ -767,21 +661,6 @@ def test_format_missing(self, vals, nulls_fixture): assert formatted == expected assert index[3] is nulls_fixture - def test_format_with_name_time_info(self): - # bug I fixed 12/20/2011 - dates = date_range("2011-01-01 04:00:00", periods=10, name="something") - - formatted = dates.format(name=True) - assert formatted[0] == "something" - - def test_format_datetime_with_time(self): - t = Index([datetime(2012, 2, 7), datetime(2012, 2, 7, 23)]) - - result = t.format() - expected = ["2012-02-07 00:00:00", "2012-02-07 23:00:00"] - assert len(result) == 2 - assert result == expected - @pytest.mark.parametrize("op", ["any", "all"]) def test_logical_compat(self, op, simple_index): index = simple_index @@ -1129,12 +1008,6 @@ def test_outer_join_sort(self): tm.assert_index_equal(result, expected) - def test_nan_first_take_datetime(self): - index = Index([pd.NaT, Timestamp("20130101"), Timestamp("20130102")]) - result = index.take([-1, 0, 1]) - expected = Index([index[-1], index[0], index[1]]) - tm.assert_index_equal(result, expected) - def test_take_fill_value(self): # GH 12631 index = Index(list("ABC"), name="xxx") diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index ed9243a5ba8d0..1592c34b48dd8 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -1,7 +1,7 @@ """ Collection of tests asserting things that should be true for -any index subclass. Makes use of the `indices` fixture defined -in pandas/tests/indexes/conftest.py. +any index subclass except for MultiIndex. Makes use of the `index_flat` +fixture defined in pandas/conftest.py. """ import re @@ -29,6 +29,26 @@ class TestCommon: + @pytest.mark.parametrize("name", [None, "new_name"]) + def test_to_frame(self, name, index_flat): + # see GH#15230, GH#22580 + idx = index_flat + + if name: + idx_name = name + else: + idx_name = idx.name or 0 + + df = idx.to_frame(name=idx_name) + + assert df.index is idx + assert len(df.columns) == 1 + assert df.columns[0] == idx_name + assert df[idx_name].values is not idx.values + + df = idx.to_frame(index=False, name=idx_name) + assert df.index is not idx + def test_droplevel(self, index): # GH 21115 if isinstance(index, MultiIndex): @@ -126,6 +146,46 @@ def test_copy_and_deepcopy(self, index_flat): new_copy = index.copy(deep=True, name="banana") assert new_copy.name == "banana" + def test_copy_name(self, index_flat): + # GH#12309: Check that the "name" argument + # passed at initialization is honored. + index = index_flat + + first = type(index)(index, copy=True, name="mario") + second = type(first)(first, copy=False) + + # Even though "copy=False", we want a new object. + assert first is not second + tm.assert_index_equal(first, second) + + # Not using tm.assert_index_equal() since names differ. + assert index.equals(first) + + assert first.name == "mario" + assert second.name == "mario" + + # TODO: belongs in series arithmetic tests? 
+ s1 = pd.Series(2, index=first) + s2 = pd.Series(3, index=second[:-1]) + # See GH#13365 + s3 = s1 * s2 + assert s3.index.name == "mario" + + def test_copy_name2(self, index_flat): + # GH#35592 + index = index_flat + if isinstance(index, MultiIndex): + return + + assert index.copy(name="mario").name == "mario" + + with pytest.raises(ValueError, match="Length of new names must be 1, got 2"): + index.copy(name=["mario", "luigi"]) + + msg = f"{type(index).__name__}.name must be a hashable type" + with pytest.raises(TypeError, match=msg): + index.copy(name=[["mario"]]) + def test_unique_level(self, index_flat): # don't test a MultiIndex here (as its tested separated) index = index_flat diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index 5c5ec7219d2d7..deeaffaf5b9cc 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -1,11 +1,17 @@ """ Tests for the Index constructor conducting inference. """ +from datetime import ( + datetime, + timedelta, +) from decimal import Decimal import numpy as np import pytest +from pandas.compat import np_datetime64_compat + from pandas.core.dtypes.common import is_unsigned_integer_dtype from pandas import ( @@ -27,6 +33,7 @@ ) import pandas._testing as tm from pandas.core.api import ( + Float64Index, Int64Index, UInt64Index, ) @@ -232,6 +239,91 @@ def test_constructor_int_dtype_nan_raises(self, dtype): with pytest.raises(ValueError, match=msg): Index(data, dtype=dtype) + @pytest.mark.parametrize( + "vals", + [ + [1, 2, 3], + np.array([1, 2, 3]), + np.array([1, 2, 3], dtype=int), + # below should coerce + [1.0, 2.0, 3.0], + np.array([1.0, 2.0, 3.0], dtype=float), + ], + ) + def test_constructor_dtypes_to_int64(self, vals): + index = Index(vals, dtype=int) + assert isinstance(index, Int64Index) + + @pytest.mark.parametrize( + "vals", + [ + [1, 2, 3], + [1.0, 2.0, 3.0], + np.array([1.0, 2.0, 3.0]), + np.array([1, 2, 3], dtype=int), + np.array([1.0, 2.0, 3.0], dtype=float), + ], + ) + def test_constructor_dtypes_to_float64(self, vals): + index = Index(vals, dtype=float) + assert isinstance(index, Float64Index) + + @pytest.mark.parametrize( + "vals", + [ + [1, 2, 3], + np.array([1, 2, 3], dtype=int), + np.array( + [np_datetime64_compat("2011-01-01"), np_datetime64_compat("2011-01-02")] + ), + [datetime(2011, 1, 1), datetime(2011, 1, 2)], + ], + ) + def test_constructor_dtypes_to_categorical(self, vals): + index = Index(vals, dtype="category") + assert isinstance(index, CategoricalIndex) + + @pytest.mark.parametrize("cast_index", [True, False]) + @pytest.mark.parametrize( + "vals", + [ + Index( + np.array( + [ + np_datetime64_compat("2011-01-01"), + np_datetime64_compat("2011-01-02"), + ] + ) + ), + Index([datetime(2011, 1, 1), datetime(2011, 1, 2)]), + ], + ) + def test_constructor_dtypes_to_datetime(self, cast_index, vals): + if cast_index: + index = Index(vals, dtype=object) + assert isinstance(index, Index) + assert index.dtype == object + else: + index = Index(vals) + assert isinstance(index, DatetimeIndex) + + @pytest.mark.parametrize("cast_index", [True, False]) + @pytest.mark.parametrize( + "vals", + [ + np.array([np.timedelta64(1, "D"), np.timedelta64(1, "D")]), + [timedelta(1), timedelta(1)], + ], + ) + def test_constructor_dtypes_to_timedelta(self, cast_index, vals): + if cast_index: + index = Index(vals, dtype=object) + assert isinstance(index, Index) + assert index.dtype == object + else: + index = Index(vals) + assert isinstance(index, TimedeltaIndex) + class 
TestIndexConstructorUnwrapping: # Test passing different arraylike values to pd.Index diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index 66fdaa2778600..0c2f8d0103ceb 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -21,14 +21,6 @@ class TestGetItem: - def test_ellipsis(self): - # GH#21282 - idx = timedelta_range("1 day", "31 day", freq="D", name="idx") - - result = idx[...] - assert result.equals(idx) - assert result is not idx - def test_getitem_slice_keeps_name(self): # GH#4226 tdi = timedelta_range("1d", "5d", freq="H", name="timebucket") diff --git a/pandas/tests/indexing/test_datetime.py b/pandas/tests/indexing/test_datetime.py index e46eed05caa86..332ab02255911 100644 --- a/pandas/tests/indexing/test_datetime.py +++ b/pandas/tests/indexing/test_datetime.py @@ -130,7 +130,7 @@ def test_nanosecond_getitem_setitem_with_tz(self): expected = DataFrame(-1, index=index, columns=["a"]) tm.assert_frame_equal(result, expected) - def test_getitem_millisecond_resolution(self, frame_or_series): + def test_getitem_str_slice_millisecond_resolution(self, frame_or_series): # GH#33589 keys = [ @@ -152,16 +152,3 @@ def test_getitem_millisecond_resolution(self, frame_or_series): ], ) tm.assert_equal(result, expected) - - def test_str_subclass(self): - # GH 37366 - class mystring(str): - pass - - data = ["2020-10-22 01:21:00+00:00"] - index = pd.DatetimeIndex(data) - df = DataFrame({"a": [1]}, index=index) - df["b"] = 2 - df[mystring("c")] = 3 - expected = DataFrame({"a": [1], "b": [2], mystring("c"): [3]}, index=index) - tm.assert_equal(df, expected) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 2805c8877ed78..6a9ece738952d 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -323,9 +323,9 @@ def test_dups_fancy_indexing3(self): def test_duplicate_int_indexing(self, indexer_sl): # GH 17347 - s = Series(range(3), index=[1, 1, 3]) - expected = s[1] - result = indexer_sl(s)[[1]] + ser = Series(range(3), index=[1, 1, 3]) + expected = Series(range(2), index=[1, 1]) + result = indexer_sl(ser)[[1]] tm.assert_series_equal(result, expected) def test_indexing_mixed_frame_bug(self): @@ -653,13 +653,6 @@ def test_loc_setitem_fullindex_views(self): df.loc[df.index] = df.loc[df.index] tm.assert_frame_equal(df, df2) - def test_float_index_at_iat(self): - s = Series([1, 2, 3], index=[0.1, 0.2, 0.3]) - for el, item in s.items(): - assert s.at[el] == item - for i in range(len(s)): - assert s.iat[i] == i + 1 - def test_rhs_alignment(self): # GH8258, tests that both rows & columns are aligned to what is # assigned to. 
covers both uniform data-type & multi-type cases @@ -963,7 +956,11 @@ def test_extension_array_cross_section(): def test_extension_array_cross_section_converts(): # all numeric columns -> numeric series df = DataFrame( - {"A": pd.array([1, 2], dtype="Int64"), "B": np.array([1, 2])}, index=["a", "b"] + { + "A": pd.array([1, 2], dtype="Int64"), + "B": np.array([1, 2], dtype="int64"), + }, + index=["a", "b"], ) result = df.loc["a"] expected = Series([1, 1], dtype="Int64", index=["A", "B"], name="a") @@ -983,10 +980,3 @@ def test_extension_array_cross_section_converts(): result = df.iloc[0] tm.assert_series_equal(result, expected) - - -def test_getitem_object_index_float_string(): - # GH 17286 - s = Series([1] * 4, index=Index(["a", "b", "c", 1.0])) - assert s["a"] == 1 - assert s[1.0] == 1 diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py index bf262e6755289..bcb76fb078e74 100644 --- a/pandas/tests/indexing/test_scalar.py +++ b/pandas/tests/indexing/test_scalar.py @@ -77,6 +77,13 @@ def _check(f, func, values=False): class TestAtAndiAT: # at and iat tests that don't need Base class + def test_float_index_at_iat(self): + ser = Series([1, 2, 3], index=[0.1, 0.2, 0.3]) + for el, item in ser.items(): + assert ser.at[el] == item + for i in range(len(ser)): + assert ser.iat[i] == i + 1 + def test_at_iat_coercion(self): # as timestamp is not a tuple! diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index 03b1c512f9053..4c17917b949ca 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -36,6 +36,12 @@ class TestSeriesGetitemScalars: + def test_getitem_object_index_float_string(self): + # GH#17286 + ser = Series([1] * 4, index=Index(["a", "b", "c", 1.0])) + assert ser["a"] == 1 + assert ser[1.0] == 1 + def test_getitem_float_keys_tuple_values(self): # see GH#13509 From 07261dd6ea24e8e01904ac123ccf65eb9428f1cd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 13 Nov 2021 18:06:11 -0800 Subject: [PATCH 39/53] disable xfail (#44436) --- pandas/tests/io/parser/test_compression.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index e0799df8d7a4c..5aa0edfd8b46a 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -103,8 +103,6 @@ def test_compression(parser_and_data, compression_only, buffer, filename): tm.write_to_compressed(compress_type, path, data) compression = "infer" if filename else compress_type - if ext == "bz2": - pytest.xfail("pyarrow wheels don't have bz2 codec support") if buffer: with open(path, "rb") as f: result = parser.read_csv(f, compression=compression) From 1a4a6891845ebe774bc3f1f08c5ef043a87f3823 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 13 Nov 2021 18:06:51 -0800 Subject: [PATCH 40/53] REF: simplify putmask_smart (#44435) --- pandas/core/array_algos/putmask.py | 41 +++++++----------------------- pandas/core/internals/blocks.py | 16 ++++++++---- 2 files changed, 20 insertions(+), 37 deletions(-) diff --git a/pandas/core/array_algos/putmask.py b/pandas/core/array_algos/putmask.py index 77e38e6c6e3fc..1f37e0e5d249a 100644 --- a/pandas/core/array_algos/putmask.py +++ b/pandas/core/array_algos/putmask.py @@ -4,7 +4,6 @@ from __future__ import annotations from typing import Any -import warnings import numpy as np @@ -15,16 +14,12 @@ ) from pandas.core.dtypes.cast import ( + 
can_hold_element, convert_scalar_for_putitemlike, find_common_type, infer_dtype_from, ) -from pandas.core.dtypes.common import ( - is_float_dtype, - is_integer_dtype, - is_list_like, -) -from pandas.core.dtypes.missing import isna_compat +from pandas.core.dtypes.common import is_list_like from pandas.core.arrays import ExtensionArray @@ -75,7 +70,7 @@ def putmask_smart(values: np.ndarray, mask: npt.NDArray[np.bool_], new) -> np.nd `values`, updated in-place. mask : np.ndarray[bool] Applies to both sides (array like). - new : `new values` either scalar or an array like aligned with `values` + new : listlike `new values` aligned with `values` Returns ------- @@ -89,9 +84,6 @@ def putmask_smart(values: np.ndarray, mask: npt.NDArray[np.bool_], new) -> np.nd # we cannot use np.asarray() here as we cannot have conversions # that numpy does when numeric are mixed with strings - if not is_list_like(new): - new = np.broadcast_to(new, mask.shape) - # see if we are only masking values that if putted # will work in the current dtype try: @@ -100,27 +92,12 @@ def putmask_smart(values: np.ndarray, mask: npt.NDArray[np.bool_], new) -> np.nd # TypeError: only integer scalar arrays can be converted to a scalar index pass else: - # make sure that we have a nullable type if we have nulls - if not isna_compat(values, nn[0]): - pass - elif not (is_float_dtype(nn.dtype) or is_integer_dtype(nn.dtype)): - # only compare integers/floats - pass - elif not (is_float_dtype(values.dtype) or is_integer_dtype(values.dtype)): - # only compare integers/floats - pass - else: - - # we ignore ComplexWarning here - with warnings.catch_warnings(record=True): - warnings.simplefilter("ignore", np.ComplexWarning) - nn_at = nn.astype(values.dtype) - - comp = nn == nn_at - if is_list_like(comp) and comp.all(): - nv = values.copy() - nv[mask] = nn_at - return nv + # We only get to putmask_smart when we cannot hold 'new' in values. + # The "smart" part of putmask_smart is checking if we can hold new[mask] + # in values, in which case we can still avoid the need to cast. 
+ if can_hold_element(values, nn): + values[mask] = nn + return values new = np.asarray(new) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 55e5b0d0439fa..e20bbb0d90fba 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -952,7 +952,8 @@ def putmask(self, mask, new) -> list[Block]: List[Block] """ orig_mask = mask - mask, noop = validate_putmask(self.values.T, mask) + values = cast(np.ndarray, self.values) + mask, noop = validate_putmask(values.T, mask) assert not isinstance(new, (ABCIndex, ABCSeries, ABCDataFrame)) # if we are passed a scalar None, convert it here @@ -960,7 +961,6 @@ def putmask(self, mask, new) -> list[Block]: new = self.fill_value if self._can_hold_element(new): - # error: Argument 1 to "putmask_without_repeat" has incompatible type # "Union[ndarray, ExtensionArray]"; expected "ndarray" putmask_without_repeat(self.values.T, mask, new) # type: ignore[arg-type] @@ -979,9 +979,15 @@ def putmask(self, mask, new) -> list[Block]: elif self.ndim == 1 or self.shape[0] == 1: # no need to split columns - # error: Argument 1 to "putmask_smart" has incompatible type "Union[ndarray, - # ExtensionArray]"; expected "ndarray" - nv = putmask_smart(self.values.T, mask, new).T # type: ignore[arg-type] + if not is_list_like(new): + # putmask_smart can't save us the need to cast + return self.coerce_to_target_dtype(new).putmask(mask, new) + + # This differs from + # `self.coerce_to_target_dtype(new).putmask(mask, new)` + # because putmask_smart will check if new[mask] may be held + # by our dtype. + nv = putmask_smart(values.T, mask, new).T return [self.make_block(nv)] else: From 4f05236220972b6185909e4d37d55f9cb9de9312 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sat, 13 Nov 2021 21:08:32 -0500 Subject: [PATCH 41/53] ENH: Use stacklevel in warnings (#44439) --- pandas/core/algorithms.py | 5 +++-- pandas/core/arraylike.py | 4 +++- pandas/core/arrays/datetimelike.py | 3 +-- pandas/core/dtypes/cast.py | 9 +++------ pandas/core/generic.py | 5 +++-- pandas/core/groupby/grouper.py | 12 +++--------- pandas/core/internals/array_manager.py | 2 +- pandas/core/internals/blocks.py | 2 +- pandas/core/series.py | 2 +- pandas/core/strings/accessor.py | 2 +- pandas/io/excel/_base.py | 3 +-- pandas/io/formats/style.py | 13 +++++++------ 12 files changed, 28 insertions(+), 34 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 8c2c01b6aedc8..acc66ae9deca7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -35,6 +35,7 @@ npt, ) from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import ( construct_1d_object_array_from_listlike, @@ -1550,7 +1551,7 @@ def searchsorted( _diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"} -def diff(arr, n: int, axis: int = 0, stacklevel: int = 3): +def diff(arr, n: int, axis: int = 0): """ difference of n between self, analogous to s-s.shift(n) @@ -1596,7 +1597,7 @@ def diff(arr, n: int, axis: int = 0, stacklevel: int = 3): "dtype lost in 'diff()'. In the future this will raise a " "TypeError. 
Convert to a suitable dtype prior to calling 'diff'.", FutureWarning, - stacklevel=stacklevel, + stacklevel=find_stack_level(), ) arr = np.asarray(arr) dtype = arr.dtype diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index 11d32e8a159f3..d91404ff05157 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -337,7 +337,9 @@ def reconstruct(result): "Consider explicitly converting the DataFrame " "to an array with '.to_numpy()' first." ) - warnings.warn(msg.format(ufunc), FutureWarning, stacklevel=4) + warnings.warn( + msg.format(ufunc), FutureWarning, stacklevel=find_stack_level() + ) return result raise NotImplementedError return result diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index f8aa1656c8c30..2e1ebf9d5a266 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -416,13 +416,12 @@ def astype(self, dtype, copy: bool = True): elif is_integer_dtype(dtype): # we deliberately ignore int32 vs. int64 here. # See https://github.com/pandas-dev/pandas/issues/24381 for more. - level = find_stack_level() warnings.warn( f"casting {self.dtype} values to int64 with .astype(...) is " "deprecated and will raise in a future version. " "Use .view(...) instead.", FutureWarning, - stacklevel=level, + stacklevel=find_stack_level(), ) values = self.asi8 diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 2c26d6f838315..9cd67ad293f63 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -969,13 +969,12 @@ def astype_dt64_to_dt64tz( # this should be the only copy values = values.copy() - level = find_stack_level() warnings.warn( "Using .astype to convert from timezone-naive dtype to " "timezone-aware dtype is deprecated and will raise in a " "future version. Use ser.dt.tz_localize instead.", FutureWarning, - stacklevel=level, + stacklevel=find_stack_level(), ) # GH#33401 this doesn't match DatetimeArray.astype, which @@ -986,13 +985,12 @@ def astype_dt64_to_dt64tz( # DatetimeArray/DatetimeIndex.astype behavior if values.tz is None and aware: dtype = cast(DatetimeTZDtype, dtype) - level = find_stack_level() warnings.warn( "Using .astype to convert from timezone-naive dtype to " "timezone-aware dtype is deprecated and will raise in a " "future version. Use obj.tz_localize instead.", FutureWarning, - stacklevel=level, + stacklevel=find_stack_level(), ) return values.tz_localize(dtype.tz) @@ -1006,14 +1004,13 @@ def astype_dt64_to_dt64tz( return result elif values.tz is not None: - level = find_stack_level() warnings.warn( "Using .astype to convert from timezone-aware dtype to " "timezone-naive dtype is deprecated and will raise in a " "future version. Use obj.tz_localize(None) or " "obj.tz_convert('UTC').tz_localize(None) instead", FutureWarning, - stacklevel=level, + stacklevel=find_stack_level(), ) result = values.tz_convert("UTC").tz_localize(None) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6b51456006021..38a2cb46ad21d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -487,9 +487,10 @@ def _data(self): @property def _AXIS_NUMBERS(self) -> dict[str, int]: """.. 
deprecated:: 1.1.0""" - level = self.ndim + 1 warnings.warn( - "_AXIS_NUMBERS has been deprecated.", FutureWarning, stacklevel=level + "_AXIS_NUMBERS has been deprecated.", + FutureWarning, + stacklevel=find_stack_level(), ) return {"index": 0} diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 7577b1e671d60..6cbe37c6b3838 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -21,6 +21,7 @@ ) from pandas.errors import InvalidIndexError from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import sanitize_to_nanoseconds from pandas.core.dtypes.common import ( @@ -964,8 +965,6 @@ def _check_deprecated_resample_kwargs(kwargs, origin): From where this function is being called; either Grouper or TimeGrouper. Used to determine an approximate stacklevel. """ - from pandas.core.resample import TimeGrouper - # Deprecation warning of `base` and `loffset` since v1.1.0: # we are raising the warning here to be able to set the `stacklevel` # properly since we need to raise the `base` and `loffset` deprecation @@ -975,11 +974,6 @@ def _check_deprecated_resample_kwargs(kwargs, origin): # core/groupby/grouper.py::Grouper # raising these warnings from TimeGrouper directly would fail the test: # tests/resample/test_deprecated.py::test_deprecating_on_loffset_and_base - # hacky way to set the stacklevel: if cls is TimeGrouper it means - # that the call comes from a pandas internal call of resample, - # otherwise it comes from pd.Grouper - stacklevel = (5 if origin is TimeGrouper else 2) + 1 - # the + 1 is for this helper function, check_deprecated_resample_kwargs if kwargs.get("base", None) is not None: warnings.warn( @@ -989,7 +983,7 @@ def _check_deprecated_resample_kwargs(kwargs, origin): "\nbecomes:\n" '\n>>> df.resample(freq="3s", offset="2s")\n', FutureWarning, - stacklevel=stacklevel, + stacklevel=find_stack_level(), ) if kwargs.get("loffset", None) is not None: warnings.warn( @@ -1000,5 +994,5 @@ def _check_deprecated_resample_kwargs(kwargs, origin): '\n>>> df = df.resample(freq="3s").mean()' '\n>>> df.index = df.index.to_timestamp() + to_offset("8H")\n', FutureWarning, - stacklevel=stacklevel, + stacklevel=find_stack_level(), ) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 543b2ea26f750..1cd9fe65407ba 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -365,7 +365,7 @@ def diff(self: T, n: int, axis: int) -> T: # with axis=0 is equivalent assert n == 0 axis = 0 - return self.apply(algos.diff, n=n, axis=axis, stacklevel=5) + return self.apply(algos.diff, n=n, axis=axis) def interpolate(self: T, **kwargs) -> T: return self.apply_with_block("interpolate", swap_axis=False, **kwargs) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index e20bbb0d90fba..46e5b5b9c53ad 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1128,7 +1128,7 @@ def take_nd( def diff(self, n: int, axis: int = 1) -> list[Block]: """return block for the diff of the values""" - new_values = algos.diff(self.values, n, axis=axis, stacklevel=7) + new_values = algos.diff(self.values, n, axis=axis) return [self.make_block(values=new_values)] def shift(self, periods: int, axis: int = 0, fill_value: Any = None) -> list[Block]: diff --git a/pandas/core/series.py b/pandas/core/series.py index b3c9167bfbbab..e0a63b8e35105 100644 --- 
a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1012,7 +1012,7 @@ def _get_values_tuple(self, key): # mpl hackaround if com.any_none(*key): result = self._get_values(key) - deprecate_ndim_indexing(result, stacklevel=5) + deprecate_ndim_indexing(result, stacklevel=find_stack_level()) return result if not isinstance(self.index, MultiIndex): diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index f82e1aa5d188c..249fda9173b68 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1427,7 +1427,7 @@ def replace( " In addition, single character regular expressions will " "*not* be treated as literal strings when regex=True." ) - warnings.warn(msg, FutureWarning, stacklevel=3) + warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) # Check whether repl is valid (GH 13438, GH 15055) if not (isinstance(repl, str) or callable(repl)): diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 1caf334f9607e..ed79a5ad98ab9 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -519,11 +519,10 @@ def parse( if convert_float is None: convert_float = True else: - stacklevel = find_stack_level() warnings.warn( "convert_float is deprecated and will be removed in a future version.", FutureWarning, - stacklevel=stacklevel, + stacklevel=find_stack_level(), ) validate_header_arg(header) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index d91c0bb54f8dc..40803ff14e357 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -28,6 +28,7 @@ ) from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level import pandas as pd from pandas import ( @@ -310,7 +311,7 @@ def render( warnings.warn( "this method is deprecated in favour of `Styler.to_html()`", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) if sparse_index is None: sparse_index = get_option("styler.sparse.index") @@ -1675,7 +1676,7 @@ def where( warnings.warn( "this method is deprecated in favour of `Styler.applymap()`", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) if other is None: @@ -1707,7 +1708,7 @@ def set_precision(self, precision: int) -> StylerRenderer: warnings.warn( "this method is deprecated in favour of `Styler.format(precision=..)`", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) self.precision = precision return self.format(precision=precision, na_rep=self.na_rep) @@ -2217,7 +2218,7 @@ def set_na_rep(self, na_rep: str) -> StylerRenderer: warnings.warn( "this method is deprecated in favour of `Styler.format(na_rep=..)`", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) self.na_rep = na_rep return self.format(na_rep=na_rep, precision=self.precision) @@ -2271,7 +2272,7 @@ def hide_index( warnings.warn( "this method is deprecated in favour of `Styler.hide(axis='index')`", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return self.hide(axis=0, level=level, subset=subset, names=names) @@ -2324,7 +2325,7 @@ def hide_columns( warnings.warn( "this method is deprecated in favour of `Styler.hide(axis='columns')`", FutureWarning, - stacklevel=2, + stacklevel=find_stack_level(), ) return self.hide(axis=1, level=level, subset=subset, names=names) From 25b1224817a9dcfeeb3b7452359d73d3157a110b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 13 Nov 2021 18:09:07 -0800 Subject: [PATCH 42/53] TST: FIXMES in 
DataFrame.quantile tests (#44437) --- pandas/tests/frame/methods/test_quantile.py | 48 ++++++++++++++++----- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 2e6318955e119..5773edbdbcdec 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -280,9 +280,13 @@ def test_quantile_datetime(self): tm.assert_frame_equal(result, expected) # empty when numeric_only=True - # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) - # result = df[['a', 'c']].quantile(.5) - # result = df[['a', 'c']].quantile([.5]) + result = df[["a", "c"]].quantile(0.5) + expected = Series([], index=[], dtype=np.float64, name=0.5) + tm.assert_series_equal(result, expected) + + result = df[["a", "c"]].quantile([0.5]) + expected = DataFrame(index=[0.5]) + tm.assert_frame_equal(result, expected) def test_quantile_invalid(self, datetime_frame): msg = "percentiles should all be in the interval \\[0, 1\\]" @@ -481,7 +485,7 @@ def test_quantile_nat(self): ) tm.assert_frame_equal(res, exp) - def test_quantile_empty_no_rows(self): + def test_quantile_empty_no_rows_floats(self): # floats df = DataFrame(columns=["a", "b"], dtype="float64") @@ -494,21 +498,43 @@ def test_quantile_empty_no_rows(self): exp = DataFrame([[np.nan, np.nan]], columns=["a", "b"], index=[0.5]) tm.assert_frame_equal(res, exp) - # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) - # res = df.quantile(0.5, axis=1) - # res = df.quantile([0.5], axis=1) + res = df.quantile(0.5, axis=1) + exp = Series([], index=[], dtype="float64", name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5], axis=1) + exp = DataFrame(columns=[], index=[0.5]) + tm.assert_frame_equal(res, exp) + def test_quantile_empty_no_rows_ints(self): # ints df = DataFrame(columns=["a", "b"], dtype="int64") - # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) - # res = df.quantile(0.5) + res = df.quantile(0.5) + exp = Series([np.nan, np.nan], index=["a", "b"], name=0.5) + tm.assert_series_equal(res, exp) + def test_quantile_empty_no_rows_dt64(self): # datetimes df = DataFrame(columns=["a", "b"], dtype="datetime64[ns]") - # FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0) - # res = df.quantile(0.5, numeric_only=False) + res = df.quantile(0.5, numeric_only=False) + exp = Series( + [pd.NaT, pd.NaT], index=["a", "b"], dtype="datetime64[ns]", name=0.5 + ) + tm.assert_series_equal(res, exp) + + # Mixed dt64/dt64tz + df["a"] = df["a"].dt.tz_localize("US/Central") + res = df.quantile(0.5, numeric_only=False) + exp = exp.astype(object) + tm.assert_series_equal(res, exp) + + # both dt64tz + df["b"] = df["b"].dt.tz_localize("US/Central") + res = df.quantile(0.5, numeric_only=False) + exp = exp.astype(df["b"].dtype) + tm.assert_series_equal(res, exp) def test_quantile_empty_no_columns(self): # GH#23925 _get_numeric_data may drop all columns From 0e442be208232759c40998454495e07e15ef20b2 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sat, 13 Nov 2021 18:11:15 -0800 Subject: [PATCH 43/53] BLD: Exclude CPT data files (#44441) --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index f616fad6b1557..c6ddc79eaa83c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -33,6 +33,7 @@ global-exclude *.xlsb global-exclude *.xlsm global-exclude *.xlsx global-exclude *.xpt +global-exclude *.cpt global-exclude *.xz global-exclude *.zip global-exclude *~ 
From 53ca6ebb6af398c8a5131329cc7b233724235148 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 13 Nov 2021 18:12:09 -0800 Subject: [PATCH 44/53] BUG: DataFrame.astype(series) with duplicate columns (#44417) --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/generic.py | 20 ++++++++++++++------ pandas/core/groupby/generic.py | 2 +- pandas/tests/frame/methods/test_astype.py | 20 ++++++++++++++++++++ pandas/tests/groupby/test_groupby.py | 10 ++++++++++ 5 files changed, 46 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 92fadf801cec7..3915e05bcad0a 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -715,6 +715,7 @@ Styler Other ^^^^^ +- Bug in :meth:`DataFrame.astype` with non-unique columns and a :class:`Series` ``dtype`` argument (:issue:`44417`) - Bug in :meth:`CustomBusinessMonthBegin.__add__` (:meth:`CustomBusinessMonthEnd.__add__`) not applying the extra ``offset`` parameter when beginning (end) of the target month is already a business day (:issue:`41356`) - Bug in :meth:`RangeIndex.union` with another ``RangeIndex`` with matching (even) ``step`` and starts differing by strictly less than ``step / 2`` (:issue:`44019`) - Bug in :meth:`RangeIndex.difference` with ``sort=None`` and ``step<0`` failing to sort (:issue:`44085`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 38a2cb46ad21d..45ae979d5a138 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5827,14 +5827,22 @@ def astype( "Only a column name can be used for the " "key in a dtype mappings argument." ) + + # GH#44417 cast to Series so we can use .iat below, which will be + # robust in case we have non-unique columns + from pandas import Series + + dtype_ser = Series(dtype, dtype=object) + dtype_ser = dtype_ser.reindex(self.columns, fill_value=None, copy=False) + results = [] - for col_name, col in self.items(): - if col_name in dtype: - results.append( - col.astype(dtype=dtype[col_name], copy=copy, errors=errors) - ) + for i, (col_name, col) in enumerate(self.items()): + cdt = dtype_ser.iat[i] + if isna(cdt): + res_col = col.copy() if copy else col else: - results.append(col.copy() if copy else col) + res_col = col.astype(dtype=cdt, copy=copy, errors=errors) + results.append(res_col) elif is_extension_array_dtype(dtype) and self.ndim > 1: # GH 18099/22869: columnwise conversion to extension dtype diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 3c45f7263265c..b8354e800753d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -992,7 +992,7 @@ def _wrap_applied_output( result = self.obj._constructor( index=self.grouper.result_index, columns=data.columns ) - result = result.astype(data.dtypes.to_dict(), copy=False) + result = result.astype(data.dtypes, copy=False) return result # GH12824 diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 9f1f953cecc7e..e5e07761fd755 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -261,6 +261,26 @@ def test_astype_duplicate_col(self): expected = concat([a1_str, b, a2_str], axis=1) tm.assert_frame_equal(result, expected) + def test_astype_duplicate_col_series_arg(self): + # GH#44417 + vals = np.random.randn(3, 4) + df = DataFrame(vals, columns=["A", "B", "C", "A"]) + dtypes = df.dtypes + dtypes.iloc[0] = str + dtypes.iloc[2] = "Float64" + + result = df.astype(dtypes) + expected = DataFrame( + { + 0: vals[:,
0].astype(str), + 1: vals[:, 1], + 2: pd.array(vals[:, 2], dtype="Float64"), + 3: vals[:, 3], + } + ) + expected.columns = df.columns + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "dtype", [ diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 203d8abb465d0..f632da9616124 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2031,6 +2031,16 @@ def get_result(): tm.assert_equal(result, expected) +def test_empty_groupby_apply_nonunique_columns(): + # GH#44417 + df = DataFrame(np.random.randn(0, 4)) + df[3] = df[3].astype(np.int64) + df.columns = [0, 1, 2, 0] + gb = df.groupby(df[1]) + res = gb.apply(lambda x: x) + assert (res.dtypes == df.dtypes).all() + + def test_tuple_as_grouping(): # https://github.com/pandas-dev/pandas/issues/18314 df = DataFrame( From a3c964dc03cca62d0dbe0946ab068423087c5638 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 14 Nov 2021 03:13:43 +0100 Subject: [PATCH 45/53] DOC: Some minor doc cleanups (#44440) --- doc/source/user_guide/io.rst | 9 +++++---- pandas/core/frame.py | 6 ++++-- pandas/core/indexes/base.py | 14 -------------- pandas/io/parsers/readers.py | 9 +++++---- 4 files changed, 14 insertions(+), 24 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index c2ca3df5ca23d..e2f8ac09d8873 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -102,7 +102,7 @@ header : int or list of ints, default ``'infer'`` names : array-like, default ``None`` List of column names to use. If file contains no header row, then you should explicitly pass ``header=None``. Duplicates in this list are not allowed. -index_col : int, str, sequence of int / str, or False, default ``None`` +index_col : int, str, sequence of int / str, or False, optional, default ``None`` Column(s) to use as the row labels of the ``DataFrame``, either given as string name or column index. If a sequence of int / str is given, a MultiIndex is used. @@ -120,7 +120,8 @@ usecols : list-like or callable, default ``None`` Return a subset of the columns. If list-like, all elements must either be positional (i.e. integer indices into the document columns) or strings that correspond to column names provided either by the user in ``names`` or - inferred from the document header row(s). For example, a valid list-like + inferred from the document header row(s). If ``names`` are given, the document + header row(s) are not taken into account. For example, a valid list-like ``usecols`` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. To @@ -348,7 +349,7 @@ dialect : str or :class:`python:csv.Dialect` instance, default ``None`` Error handling ++++++++++++++ -error_bad_lines : boolean, default ``None`` +error_bad_lines : boolean, optional, default ``None`` Lines with too many fields (e.g. a csv line with too many commas) will by default cause an exception to be raised, and no ``DataFrame`` will be returned. If ``False``, then these "bad lines" will dropped from the @@ -358,7 +359,7 @@ error_bad_lines : boolean, default ``None`` .. deprecated:: 1.3.0 The ``on_bad_lines`` parameter should be used instead to specify behavior upon encountering a bad line instead. 
-warn_bad_lines : boolean, default ``None`` +warn_bad_lines : boolean, optional, default ``None`` If error_bad_lines is ``False``, and warn_bad_lines is ``True``, a warning for each "bad line" will be output. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 212bb63693d56..1b89eeddcf9df 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1044,12 +1044,14 @@ def _repr_html_(self) -> str | None: return None @Substitution( - header_type="bool or sequence", + header_type="bool or sequence of strings", header="Write out the column names. If a list of strings " "is given, it is assumed to be aliases for the " "column names", col_space_type="int, list or dict of int", - col_space="The minimum width of each column", + col_space="The minimum width of each column. If a list of ints is given " + "every integers corresponds with one column. If a dict is given, the key " + "references the column, while the value defines the space to use.", ) @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) def to_string( diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9715bf8f61f3c..a8896c1fde546 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6681,8 +6681,6 @@ def all(self, *args, **kwargs): Examples -------- - **all** - True, because nonzero integers are considered True. >>> pd.Index([1, 2, 3]).all() @@ -6692,18 +6690,6 @@ def all(self, *args, **kwargs): >>> pd.Index([0, 1, 2]).all() False - - **any** - - True, because ``1`` is considered True. - - >>> pd.Index([0, 0, 1]).any() - True - - False, because ``0`` is considered False. - - >>> pd.Index([0, 0, 0]).any() - False """ nv.validate_all(args, kwargs) self._maybe_disable_logical_methods("all") diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 6fb9497dbc1d6..0b57f0f5ef814 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -104,7 +104,7 @@ List of column names to use. If the file contains a header row, then you should explicitly pass ``header=0`` to override the column names. Duplicates in this list are not allowed. -index_col : int, str, sequence of int / str, or False, default ``None`` +index_col : int, str, sequence of int / str, or False, optional, default ``None`` Column(s) to use as the row labels of the ``DataFrame``, either given as string name or column index. If a sequence of int / str is given, a MultiIndex is used. @@ -116,7 +116,8 @@ Return a subset of the columns. If list-like, all elements must either be positional (i.e. integer indices into the document columns) or strings that correspond to column names provided either by the user in `names` or - inferred from the document header row(s). For example, a valid list-like + inferred from the document header row(s). If ``names`` are given, the document + header row(s) are not taken into account. For example, a valid list-like `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. To instantiate a DataFrame from ``data`` with element order preserved use @@ -331,7 +332,7 @@ `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to override values, a ParserWarning will be issued. See csv.Dialect documentation for more details. -error_bad_lines : bool, default ``None`` +error_bad_lines : bool, optional, default ``None`` Lines with too many fields (e.g. 
a csv line with too many commas) will by
    default cause an exception to be raised, and no DataFrame will be returned.
    If False, then these "bad lines" will be dropped from the DataFrame that is
@@ -340,7 +341,7 @@

     .. deprecated:: 1.3.0
        The ``on_bad_lines`` parameter should be used instead to specify behavior upon
        encountering a bad line.
-warn_bad_lines : bool, default ``None``
+warn_bad_lines : bool, optional, default ``None``
     If error_bad_lines is False, and warn_bad_lines is True, a warning for each
     "bad line" will be output.
 
From effe737e9dd4a1535a7560a9694527bf5750515c Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Sun, 14 Nov 2021 03:17:44 +0100
Subject: [PATCH 46/53] BUG: read_csv raising if parse_dates is used with
 MultiIndex columns (#44408)

---
 doc/source/whatsnew/v1.4.0.rst             |  1 +
 pandas/io/parsers/base_parser.py           | 11 ++++++--
 pandas/tests/io/parser/test_parse_dates.py | 33 ++++++++++++++++++++++
 3 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 3915e05bcad0a..4aa8af3cfed6a 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -634,6 +634,7 @@ I/O
 - Bug in :func:`json_normalize` where multi-character ``sep`` parameter is incorrectly prefixed to every key (:issue:`43831`)
 - Bug in :func:`read_csv` with :code:`float_precision="round_trip"` which did not skip initial/trailing whitespace (:issue:`43713`)
 - Bug in dumping/loading a :class:`DataFrame` with ``yaml.dump(frame)`` (:issue:`42748`)
+- Bug in :func:`read_csv` raising ``ValueError`` when ``parse_dates`` was used with ``MultiIndex`` columns (:issue:`8991`)
 -
 
 Period
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 6374f52f6964b..043eb34e18798 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -260,7 +260,8 @@ def _validate_parse_dates_presence(self, columns: list[str]) -> None:
             #   ParseDates = Union[DateGroups, List[DateGroups],
             #       Dict[ColReference, DateGroups]]
             cols_needed = itertools.chain.from_iterable(
-                col if is_list_like(col) else [col] for col in self.parse_dates
+                col if is_list_like(col) and not isinstance(col, tuple) else [col]
+                for col in self.parse_dates
             )
         else:
             cols_needed = []
@@ -1092,7 +1093,7 @@ def _isindex(colspec):
 
     if isinstance(parse_spec, list):
         # list of column lists
         for colspec in parse_spec:
-            if is_scalar(colspec):
+            if is_scalar(colspec) or isinstance(colspec, tuple):
                 if isinstance(colspec, int) and colspec not in data_dict:
                     colspec = orig_names[colspec]
                 if _isindex(colspec):
@@ -1147,7 +1148,11 @@ def _try_convert_dates(parser: Callable, colspec, data_dict, columns):
         else:
             colnames.append(c)
 
-    new_name = "_".join([str(x) for x in colnames])
+    new_name: tuple | str
+    if all(isinstance(x, tuple) for x in colnames):
+        new_name = tuple(map("_".join, zip(*colnames)))
+    else:
+        new_name = "_".join([str(x) for x in colnames])
     to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict]
 
     new_col = parser(*to_parse)
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index c8bea9592e82a..470440290016d 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -1732,6 +1732,39 @@ def test_date_parser_and_names(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@skip_pyarrow
+def test_date_parser_multiindex_columns(all_parsers):
+    parser = all_parsers
+    data = 
"""a,b +1,2 +2019-12-31,6""" + result = parser.read_csv(StringIO(data), parse_dates=[("a", "1")], header=[0, 1]) + expected = DataFrame({("a", "1"): Timestamp("2019-12-31"), ("b", "2"): [6]}) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +@pytest.mark.parametrize( + "parse_spec, col_name", + [ + ([[("a", "1"), ("b", "2")]], ("a_b", "1_2")), + ({("foo", "1"): [("a", "1"), ("b", "2")]}, ("foo", "1")), + ], +) +def test_date_parser_multiindex_columns_combine_cols(all_parsers, parse_spec, col_name): + parser = all_parsers + data = """a,b,c +1,2,3 +2019-12,-31,6""" + result = parser.read_csv( + StringIO(data), + parse_dates=parse_spec, + header=[0, 1], + ) + expected = DataFrame({col_name: Timestamp("2019-12-31"), ("c", "3"): [6]}) + tm.assert_frame_equal(result, expected) + + @skip_pyarrow def test_date_parser_usecols_thousands(all_parsers): # GH#39365 From d4106aa27ab126e3242d793b23af3ed419679e6b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 14 Nov 2021 03:23:34 +0100 Subject: [PATCH 47/53] Doc: Clean obj.empty docs to describe Series/DataFrame (#44430) --- pandas/core/generic.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 45ae979d5a138..fd8af2c0cedd0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2003,15 +2003,15 @@ def __contains__(self, key) -> bool_t: @property def empty(self) -> bool_t: """ - Indicator whether DataFrame is empty. + Indicator whether Series/DataFrame is empty. - True if DataFrame is entirely empty (no items), meaning any of the + True if Series/DataFrame is entirely empty (no items), meaning any of the axes are of length 0. Returns ------- bool - If DataFrame is empty, return True, if not return False. + If Series/DataFrame is empty, return True, if not return False. See Also -------- @@ -2021,7 +2021,7 @@ def empty(self) -> bool_t: Notes ----- - If DataFrame contains only NaNs, it is still not considered empty. See + If Series/DataFrame contains only NaNs, it is still not considered empty. See the example below. 
Examples @@ -2047,6 +2047,16 @@ def empty(self) -> bool_t: False >>> df.dropna().empty True + + >>> ser_empty = pd.Series({'A' : []}) + >>> ser_empty + A [] + dtype: object + >>> ser_empty.empty + False + >>> ser_empty = pd.Series() + >>> ser_empty.empty + True """ return any(len(self._get_axis(a)) == 0 for a in self._AXIS_ORDERS) From 72e60976f65efcfdec9edebb997de840e4d25c04 Mon Sep 17 00:00:00 2001 From: Johannes Mueller Date: Sun, 14 Nov 2021 03:26:43 +0100 Subject: [PATCH 48/53] BUG: .get_indexer_non_unique() must return an array of ints (#44084) (#44404) --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/core/indexes/interval.py | 2 ++ .../tests/indexes/interval/test_indexing.py | 27 +++++++++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 4aa8af3cfed6a..2b2040e4b51a7 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -573,7 +573,7 @@ Strings Interval ^^^^^^^^ -- +- Bug in :meth:`IntervalIndex.get_indexer_non_unique` returning boolean mask instead of array of integers for a non unique and non monotonic index (:issue:`44084`) - Indexing diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 5791f89828ca3..885c922d1ee0f 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -727,6 +727,8 @@ def _get_indexer_pointwise( if isinstance(locs, slice): # Only needed for get_indexer_non_unique locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp") + elif not self.is_unique and not self.is_monotonic: + locs = np.where(locs)[0] locs = np.array(locs, ndmin=1) except KeyError: missing.append(i) diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index f12f32724b9e1..7c00b23dc9ac4 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -8,8 +8,10 @@ from pandas import ( NA, CategoricalIndex, + Index, Interval, IntervalIndex, + MultiIndex, NaT, Series, Timedelta, @@ -374,6 +376,31 @@ def test_get_indexer_with_nans(self): expected = np.array([0, 1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) + def test_get_index_non_unique_non_monotonic(self): + # GH#44084 (root cause) + index = IntervalIndex.from_tuples( + [(0.0, 1.0), (1.0, 2.0), (0.0, 1.0), (1.0, 2.0)] + ) + + result, _ = index.get_indexer_non_unique([Interval(1.0, 2.0)]) + expected = np.array([1, 3], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer_multiindex_with_intervals(self): + # GH#44084 (MultiIndex case as reported) + interval_index = IntervalIndex.from_tuples( + [(2.0, 3.0), (0.0, 1.0), (1.0, 2.0)], name="interval" + ) + foo_index = Index([1, 2, 3], name="foo") + + multi_index = MultiIndex.from_product([foo_index, interval_index]) + + result = multi_index.get_level_values("interval").get_indexer_for( + [Interval(0.0, 1.0)] + ) + expected = np.array([1, 4, 7], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + class TestSliceLocs: def test_slice_locs_with_interval(self): From acffba29cd2f74d82597a8636740b60f77705e21 Mon Sep 17 00:00:00 2001 From: Evgeny Naumov Date: Sat, 13 Nov 2021 21:34:44 -0500 Subject: [PATCH 49/53] BUG: closes #44312: fixes unwanted TypeError when a missing metadata field is missing (#44325) --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/io/json/_normalize.py | 2 ++ pandas/tests/io/json/test_normalize.py | 27 ++++++++++++++++++++++++++ 3 
files changed, 30 insertions(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 2b2040e4b51a7..cf44912b33a16 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -632,6 +632,7 @@ I/O - Bug in :func:`read_csv`, changed exception class when expecting a file path name or file-like object from ``OSError`` to ``TypeError`` (:issue:`43366`) - Bug in :func:`read_json` not handling non-numpy dtypes correctly (especially ``category``) (:issue:`21892`, :issue:`33205`) - Bug in :func:`json_normalize` where multi-character ``sep`` parameter is incorrectly prefixed to every key (:issue:`43831`) +- Bug in :func:`json_normalize` where reading data with missing multi-level metadata would not respect errors="ignore" (:issue:`44312`) - Bug in :func:`read_csv` with :code:`float_precision="round_trip"` which did not skip initial/trailing whitespace (:issue:`43713`) - Bug in dumping/loading a :class:`DataFrame` with ``yaml.dump(frame)`` (:issue:`42748`) - Bug in :func:`read_csv` raising ``ValueError`` when ``parse_dates`` was used with ``MultiIndex`` columns (:issue:`8991`) diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 90fd5d077d031..2c2c127394fb6 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -389,6 +389,8 @@ def _pull_field( try: if isinstance(spec, list): for field in spec: + if result is None: + raise KeyError(field) result = result[field] else: result = result[spec] diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index a2b90f607e918..272a4aa6723dd 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -634,6 +634,33 @@ def test_missing_meta(self, missing_metadata): expected = DataFrame(ex_data, columns=columns) tm.assert_frame_equal(result, expected) + def test_missing_nested_meta(self): + # GH44312 + # If errors="ignore" and nested metadata is null, we should return nan + data = {"meta": "foo", "nested_meta": None, "value": [{"rec": 1}, {"rec": 2}]} + result = json_normalize( + data, + record_path="value", + meta=["meta", ["nested_meta", "leaf"]], + errors="ignore", + ) + ex_data = [[1, "foo", np.nan], [2, "foo", np.nan]] + columns = ["rec", "meta", "nested_meta.leaf"] + expected = DataFrame(ex_data, columns=columns).astype( + {"nested_meta.leaf": object} + ) + tm.assert_frame_equal(result, expected) + + # If errors="raise" and nested metadata is null, we should raise with the + # key of the first missing level + with pytest.raises(KeyError, match="'leaf' not found"): + json_normalize( + data, + record_path="value", + meta=["meta", ["nested_meta", "leaf"]], + errors="raise", + ) + def test_missing_meta_multilevel_record_path_errors_raise(self, missing_metadata): # GH41876 # Ensure errors='raise' works as intended even when a record_path of length From 01a6f4bba4088a5bb72e7d10cf33b88db27959e7 Mon Sep 17 00:00:00 2001 From: Loic Diridollou Date: Sat, 13 Nov 2021 18:40:07 -0800 Subject: [PATCH 50/53] DOC: df.to_html documentation incorrectly contains min_rows optional param (#44331) --- pandas/core/frame.py | 14 ++++++++------ pandas/io/formats/format.py | 3 --- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1b89eeddcf9df..b88c97b8e988d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1068,11 +1068,11 @@ def to_string( index_names: bool = True, justify: str | None = None, max_rows: int | None = None, - 
min_rows: int | None = None,
         max_cols: int | None = None,
         show_dimensions: bool = False,
         decimal: str = ".",
         line_width: int | None = None,
+        min_rows: int | None = None,
         max_colwidth: int | None = None,
         encoding: str | None = None,
     ) -> str | None:
@@ -1081,6 +1081,9 @@
         %(shared_params)s
         line_width : int, optional
             Width to wrap a line in characters.
+        min_rows : int, optional
+            The number of rows to display in the console in a truncated repr
+            (when number of rows is above `max_rows`).
         max_colwidth : int, optional
             Max width to truncate each column in characters. By default, no limit.
@@ -2839,15 +2842,14 @@ def to_html(
         border : int
             A ``border=border`` attribute is included in the opening `<table>` tag.
             Default ``pd.options.display.html.border``.
-        encoding : str, default "utf-8"
-            Set character encoding.
-
-            .. versionadded:: 1.0
-
         table_id : str, optional
             A css id is included in the opening `<table>
` tag if specified.
         render_links : bool, default False
             Convert URLs to HTML links.
+        encoding : str, default "utf-8"
+            Set character encoding.
+
+            .. versionadded:: 1.0
         %(returns)s
         See Also
         --------
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index ba85a1b340d05..ca53bfb7d5e08 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -164,9 +164,6 @@
         * unset.
     max_rows : int, optional
         Maximum number of rows to display in the console.
-    min_rows : int, optional
-        The number of rows to display in the console in a truncated repr
-        (when number of rows is above `max_rows`).
     max_cols : int, optional
         Maximum number of columns to display in the console.
     show_dimensions : bool, default False
From 5307b18022dbf05eef0be99c4b2fad4080fb7848 Mon Sep 17 00:00:00 2001
From: brendandrury <72849852+brendandrury@users.noreply.github.com>
Date: Sat, 13 Nov 2021 19:08:55 -0800
Subject: [PATCH 51/53] [BUG] Fix DataFrameGroupBy.boxplot with subplots=False
 fails for object columns (#44003)

---
 doc/source/whatsnew/v1.4.0.rst               | 2 +-
 pandas/plotting/_matplotlib/boxplot.py       | 5 +++++
 pandas/tests/plotting/test_boxplot_method.py | 8 ++++++++
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index cf44912b33a16..dd0561efc37e7 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -646,7 +646,7 @@ Period
 Plotting
 ^^^^^^^^
 
--
+- When given non-numeric data, :meth:`DataFrame.boxplot` now raises a ``ValueError`` rather than a cryptic ``KeyError`` or ``ZeroDivisionError``, in line with other plotting functions like :meth:`DataFrame.hist` (:issue:`43480`)
 -
 
 Groupby/resample/rolling
diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py
index 1308a83f61443..a2089de294e22 100644
--- a/pandas/plotting/_matplotlib/boxplot.py
+++ b/pandas/plotting/_matplotlib/boxplot.py
@@ -391,6 +391,11 @@ def plot_group(keys, values, ax: Axes):
     with plt.rc_context(rc):
         ax = plt.gca()
     data = data._get_numeric_data()
+    naxes = len(data.columns)
+    if naxes == 0:
+        raise ValueError(
+            "boxplot method requires numerical columns, nothing to plot."
+        )
     if columns is None:
         columns = data.columns
     else:
diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py
index dbceeae44a493..ce32e5801e461 100644
--- a/pandas/tests/plotting/test_boxplot_method.py
+++ b/pandas/tests/plotting/test_boxplot_method.py
@@ -543,6 +543,14 @@ def test_groupby_boxplot_subplots_false(self, col, expected_xticklabel):
         result_xticklabel = [x.get_text() for x in axes.get_xticklabels()]
         assert expected_xticklabel == result_xticklabel
 
+    def test_groupby_boxplot_object(self):
+        # GH 43480
+        df = self.hist_df.astype("object")
+        grouped = df.groupby("gender")
+        msg = "boxplot method requires numerical columns, nothing to plot"
+        with pytest.raises(ValueError, match=msg):
+            _check_plot_works(grouped.boxplot, subplots=False)
+
     def test_boxplot_multiindex_column(self):
         # GH 16748
         arrays = [
From 564c8575212e424185ad5ea3d440db79ce0e3a45 Mon Sep 17 00:00:00 2001
From: Scott Talbert
Date: Sat, 13 Nov 2021 22:19:08 -0500
Subject: [PATCH 52/53] ENH: Support timespec argument in Timestamp.isoformat()
 (#44397)

* ENH: Support timespec argument in Timestamp.isoformat()

* Get rid of tabs

* Copy isoformat docstring to NaTType

* Remove NaT docstring changes & update NaT tests

* Fix another black issue

---
 doc/source/whatsnew/v1.4.0.rst                |  1 +
 pandas/_libs/tslibs/nattype.pyx               |  2 +-
 pandas/_libs/tslibs/timestamps.pyx            | 48 +++++++++++--
 pandas/tests/scalar/test_nat.py               |  5 ++
 pandas/tests/scalar/timestamp/test_formats.py | 71 +++++++++++++++++++
 5 files changed, 119 insertions(+), 8 deletions(-)
 create mode 100644 pandas/tests/scalar/timestamp/test_formats.py

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index dd0561efc37e7..9c99f23527c47 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -209,6 +209,7 @@ Other enhancements
 - :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`)
 - :meth:`read_excel` now accepts a ``decimal`` argument that allows the user to specify the decimal point when parsing string columns to numeric (:issue:`14403`)
 - :meth:`.GroupBy.mean` now supports `Numba <https://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`43731`)
+- :meth:`Timestamp.isoformat` now handles the ``timespec`` argument from the base :class:`datetime` class (:issue:`26131`)
 
.. ---------------------------------------------------------------------------

diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx
index 2aebf75ba35d4..09bfc4527a428 100644
--- a/pandas/_libs/tslibs/nattype.pyx
+++ b/pandas/_libs/tslibs/nattype.pyx
@@ -295,7 +295,7 @@ cdef class _NaT(datetime):
     def __str__(self) -> str:
         return "NaT"
 
-    def isoformat(self, sep="T") -> str:
+    def isoformat(self, sep: str = "T", timespec: str = "auto") -> str:
         # This allows Timestamp(ts.isoformat()) to always correctly roundtrip.
         return "NaT"
 
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
index 613da5a691736..28b8158548ca8 100644
--- a/pandas/_libs/tslibs/timestamps.pyx
+++ b/pandas/_libs/tslibs/timestamps.pyx
@@ -737,9 +737,42 @@
     # -----------------------------------------------------------------
     # Rendering Methods
 
-    def isoformat(self, sep: str = "T") -> str:
-        base = super(_Timestamp, self).isoformat(sep=sep)
-        if self.nanosecond == 0:
+    def isoformat(self, sep: str = "T", timespec: str = "auto") -> str:
+        """
+        Return the time formatted according to ISO.
+
+        The full format looks like 'YYYY-MM-DD HH:MM:SS.mmmmmmnnn'.
+        By default, the fractional part is omitted if self.microsecond == 0
+        and self.nanosecond == 0.
+
+        If self.tzinfo is not None, the UTC offset is also attached, giving
+        a full format of 'YYYY-MM-DD HH:MM:SS.mmmmmmnnn+HH:MM'.
+
+        Parameters
+        ----------
+        sep : str, default 'T'
+            String used as the separator between the date and time.
+
+        timespec : str, default 'auto'
+            Specifies the number of additional terms of the time to include.
+            The valid values are 'auto', 'hours', 'minutes', 'seconds',
+            'milliseconds', 'microseconds', and 'nanoseconds'.
+
+        Returns
+        -------
+        str
+
+        Examples
+        --------
+        >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
+        >>> ts.isoformat()
+        '2020-03-14T15:32:52.192548651'
+        >>> ts.isoformat(timespec='microseconds')
+        '2020-03-14T15:32:52.192548'
+        """
+        base_ts = "microseconds" if timespec == "nanoseconds" else timespec
+        base = super(_Timestamp, self).isoformat(sep=sep, timespec=base_ts)
+        if self.nanosecond == 0 and timespec != "nanoseconds":
             return base
 
         if self.tzinfo is not None:
@@ -747,10 +780,11 @@
         else:
             base1, base2 = base, ""
 
-        if self.microsecond != 0:
-            base1 += f"{self.nanosecond:03d}"
-        else:
-            base1 += f".{self.nanosecond:09d}"
+        if timespec == "nanoseconds" or (timespec == "auto" and self.nanosecond):
+            if self.microsecond:
+                base1 += f"{self.nanosecond:03d}"
+            else:
+                base1 += f".{self.nanosecond:09d}"
 
         return base1 + base2
 
diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py
index 21ed57813b60d..b9718249b38c8 100644
--- a/pandas/tests/scalar/test_nat.py
+++ b/pandas/tests/scalar/test_nat.py
@@ -182,6 +182,7 @@ def test_nat_methods_nat(method):
 def test_nat_iso_format(get_nat):
     # see gh-12300
     assert get_nat("NaT").isoformat() == "NaT"
+    assert get_nat("NaT").isoformat(timespec="nanoseconds") == "NaT"
 
 
 @pytest.mark.parametrize(
@@ -325,6 +326,10 @@ def test_nat_doc_strings(compare):
     klass, method = compare
     klass_doc = getattr(klass, method).__doc__
 
+    # Ignore differences with Timestamp.isoformat() as they're intentional
+    if klass == Timestamp and method == "isoformat":
+        return
+
     nat_doc = getattr(NaT, method).__doc__
     assert klass_doc == nat_doc
 
diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py
new file mode 100644
index 0000000000000..71dbf3539bdb2
--- /dev/null
+++ b/pandas/tests/scalar/timestamp/test_formats.py
@@ -0,0 +1,71 @@
+import pytest
+
+from pandas import Timestamp
+
+ts_no_ns = Timestamp(
+    year=2019,
+    month=5,
+    day=18,
+    hour=15,
+    minute=17,
+    second=8,
+    microsecond=132263,
+)
+ts_ns = Timestamp(
+    year=2019,
+    month=5,
+    day=18,
+    hour=15,
+    minute=17,
+    second=8,
+    microsecond=132263,
+    nanosecond=123,
+)
+ts_ns_tz = Timestamp(
+    year=2019,
+    month=5,
+    day=18,
+    hour=15,
+    minute=17,
+    second=8,
+    microsecond=132263,
+    nanosecond=123,
+    tz="UTC",
+)
+ts_no_us = Timestamp(
+    year=2019,
+    month=5,
+    day=18,
+    hour=15,
+    minute=17,
+    second=8,
+    microsecond=0,
+    nanosecond=123,
+)
+
+
+@pytest.mark.parametrize(
+    "ts, timespec, expected_iso",
+    [
+        (ts_no_ns, "auto", "2019-05-18T15:17:08.132263"),
+        (ts_no_ns, "seconds", "2019-05-18T15:17:08"),
+        (ts_no_ns, "nanoseconds", "2019-05-18T15:17:08.132263000"),
+        (ts_ns, "auto", "2019-05-18T15:17:08.132263123"),
+        (ts_ns, "hours", "2019-05-18T15"),
+        (ts_ns, "minutes", "2019-05-18T15:17"),
+        (ts_ns, "seconds", "2019-05-18T15:17:08"),
+        (ts_ns, "milliseconds", "2019-05-18T15:17:08.132"),
(ts_ns, "microseconds", "2019-05-18T15:17:08.132263"), + (ts_ns, "nanoseconds", "2019-05-18T15:17:08.132263123"), + (ts_ns_tz, "auto", "2019-05-18T15:17:08.132263123+00:00"), + (ts_ns_tz, "hours", "2019-05-18T15+00:00"), + (ts_ns_tz, "minutes", "2019-05-18T15:17+00:00"), + (ts_ns_tz, "seconds", "2019-05-18T15:17:08+00:00"), + (ts_ns_tz, "milliseconds", "2019-05-18T15:17:08.132+00:00"), + (ts_ns_tz, "microseconds", "2019-05-18T15:17:08.132263+00:00"), + (ts_ns_tz, "nanoseconds", "2019-05-18T15:17:08.132263123+00:00"), + (ts_no_us, "auto", "2019-05-18T15:17:08.000000123"), + ], +) +def test_isoformat(ts, timespec, expected_iso): + assert ts.isoformat(timespec=timespec) == expected_iso From 4689a288b9aa350a9d1c7de4eac40e7dff3ecc3f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 13 Nov 2021 19:20:32 -0800 Subject: [PATCH 53/53] DEPR: PeriodIndex.astype(dt64) (#44398) --- doc/source/whatsnew/v1.4.0.rst | 2 ++ pandas/core/indexes/period.py | 8 ++++++++ pandas/tests/indexes/period/methods/test_astype.py | 5 ++++- pandas/tests/indexes/test_common.py | 3 +++ 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 9c99f23527c47..a593a03de5c25 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -458,6 +458,8 @@ Other Deprecations - Deprecated casting behavior when setting timezone-aware value(s) into a timezone-aware :class:`Series` or :class:`DataFrame` column when the timezones do not match. Previously this cast to object dtype. In a future version, the values being inserted will be converted to the series or column's existing timezone (:issue:`37605`) - Deprecated casting behavior when passing an item with mismatched-timezone to :meth:`DatetimeIndex.insert`, :meth:`DatetimeIndex.putmask`, :meth:`DatetimeIndex.where` :meth:`DatetimeIndex.fillna`, :meth:`Series.mask`, :meth:`Series.where`, :meth:`Series.fillna`, :meth:`Series.shift`, :meth:`Series.replace`, :meth:`Series.reindex` (and :class:`DataFrame` column analogues). In the past this has cast to object dtype. In a future version, these will cast the passed item to the index or series's timezone (:issue:`37605`) - Deprecated the 'errors' keyword argument in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, and meth:`DataFrame.mask`; in a future version the argument will be removed (:issue:`44294`) +- Deprecated :meth:`PeriodIndex.astype` to ``datetime64[ns]`` or ``DatetimeTZDtype``, use ``obj.to_timestamp(how).tz_localize(dtype.tz)`` instead (:issue:`44398`) +- .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 23851eff252b4..e3e1589d91e09 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -354,6 +354,14 @@ def astype(self, dtype, copy: bool = True, how=lib.no_default): if is_datetime64_any_dtype(dtype): # 'how' is index-specific, isn't part of the EA interface. + # GH#44398 deprecate astype(dt64), matching Series behavior + warnings.warn( + f"Converting {type(self).__name__} to DatetimeIndex with " + "'astype' is deprecated and will raise in a future version. 
" + "Use `obj.to_timestamp(how).tz_localize(dtype.tz)` instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) tz = getattr(dtype, "tz", None) return self.to_timestamp(how=how).tz_localize(tz) diff --git a/pandas/tests/indexes/period/methods/test_astype.py b/pandas/tests/indexes/period/methods/test_astype.py index e2340a2db02f7..c44f2efed1fcc 100644 --- a/pandas/tests/indexes/period/methods/test_astype.py +++ b/pandas/tests/indexes/period/methods/test_astype.py @@ -164,7 +164,10 @@ def test_period_astype_to_timestamp(self): assert res.freq == exp.freq exp = DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"], tz="US/Eastern") - res = pi.astype("datetime64[ns, US/Eastern]") + msg = "Use `obj.to_timestamp" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#44398 + res = pi.astype("datetime64[ns, US/Eastern]") tm.assert_index_equal(res, exp) assert res.freq == exp.freq diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 1592c34b48dd8..80edaf77fe960 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -392,6 +392,9 @@ def test_astype_preserves_name(self, index, dtype): ): # This astype is deprecated in favor of tz_localize warn = FutureWarning + elif isinstance(index, PeriodIndex) and dtype == "datetime64[ns]": + # Deprecated in favor of to_timestamp GH#44398 + warn = FutureWarning try: # Some of these conversions cannot succeed so we use a try / except with tm.assert_produces_warning(warn):