From 1fb9088811d2555fa87e19f3ca43dc8c79fe5d4c Mon Sep 17 00:00:00 2001 From: Vinicius Akira Imaizumi Date: Wed, 22 Feb 2023 17:39:42 +0100 Subject: [PATCH 001/577] Add fix + tests for index with dtype s3 --- pandas/core/indexes/base.py | 10 ++++++++-- pandas/tests/dtypes/test_dtypes.py | 12 ++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bd631c0c0d948..e74c6fefda67d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -501,6 +501,10 @@ def __new__( # they are actually ints, e.g. '0' and 0.0 # should not be coerced data = com.asarray_tuplesafe(data, dtype=_dtype_obj) + # GH#21470 we must update the `dtype` to avoid passing e.g. the + # numpy type `S3` to `_dtype_to_subclass`, which would raise a + # NotImplementedError. + dtype = _dtype_obj elif is_scalar(data): raise cls._raise_scalar_data_error(data) @@ -532,10 +536,12 @@ def __new__( if len(data) == 0: # unlike Series, we default to object dtype: data = np.array(data, dtype=object) - - if len(data) and isinstance(data[0], tuple): + else: # Ensure we get 1-D array of tuples instead of 2D array. 
data = com.asarray_tuplesafe(data, dtype=_dtype_obj) + if not(len(data) and isinstance(data[0], tuple)): + data = astype_array(data, dtype=dtype, copy=copy) + return Index(data, dtype=dtype, copy=copy, name=name) try: arr = sanitize_array(data, None, dtype=dtype, copy=copy) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 590cedeb6b373..a149021084aa4 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -18,6 +18,7 @@ is_interval_dtype, is_period_dtype, is_string_dtype, + pandas_dtype ) from pandas.core.dtypes.dtypes import ( CategoricalDtype, @@ -1117,3 +1118,14 @@ def test_multi_column_dtype_assignment(): df["b"] = 0 tm.assert_frame_equal(df, expected) + +def test_numpy_s3_dtype_on_index(): + # GH #21470 + index = pd.Index(['abcd', '1234'], dtype='S3') + expected = pd.Index(['abc', '123'], dtype='S3') + assert index.dtype == pandas_dtype('object') + tm.assert_frame_equal(index, expected) + + index = pd.Index(['abcd', '1234']) + expected = pd.Index(['abc', '123'], dtype='S3') + tm.assert_frame_equal(index.astype('S3'), expected) \ No newline at end of file From 6769638a542f88576dae6132ab8fc25f45ba128d Mon Sep 17 00:00:00 2001 From: Vinicius Akira Imaizumi Date: Wed, 22 Feb 2023 17:53:35 +0100 Subject: [PATCH 002/577] Fix formatting --- pandas/tests/dtypes/test_dtypes.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index a149021084aa4..dfd1abe78187f 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -18,7 +18,7 @@ is_interval_dtype, is_period_dtype, is_string_dtype, - pandas_dtype + pandas_dtype, ) from pandas.core.dtypes.dtypes import ( CategoricalDtype, @@ -1119,13 +1119,14 @@ def test_multi_column_dtype_assignment(): df["b"] = 0 tm.assert_frame_equal(df, expected) + def test_numpy_s3_dtype_on_index(): # GH #21470 - index = 
pd.Index(['abcd', '1234'], dtype='S3') - expected = pd.Index(['abc', '123'], dtype='S3') - assert index.dtype == pandas_dtype('object') + index = pd.Index(["abcd", "1234"], dtype="S3") + expected = pd.Index(["abc", "123"], dtype="S3") + assert index.dtype == pandas_dtype("object") tm.assert_frame_equal(index, expected) - index = pd.Index(['abcd', '1234']) - expected = pd.Index(['abc', '123'], dtype='S3') - tm.assert_frame_equal(index.astype('S3'), expected) \ No newline at end of file + index = pd.Index(["abcd", "1234"]) + expected = pd.Index(["abc", "123"], dtype="S3") + tm.assert_frame_equal(index.astype("S3"), expected) From b76123f0aa70e4f71af7fdfd3be44eddf0ae17c4 Mon Sep 17 00:00:00 2001 From: Vinicius Akira Imaizumi Date: Wed, 22 Feb 2023 17:59:32 +0100 Subject: [PATCH 003/577] Add comments --- pandas/core/indexes/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e74c6fefda67d..40e855549aa87 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -539,7 +539,9 @@ def __new__( else: # Ensure we get 1-D array of tuples instead of 2D array. data = com.asarray_tuplesafe(data, dtype=_dtype_obj) - if not(len(data) and isinstance(data[0], tuple)): + if not (isinstance(data[0], tuple)): + # GH#21470 we update data to a np array with the correct + # dtype. 
data = astype_array(data, dtype=dtype, copy=copy) return Index(data, dtype=dtype, copy=copy, name=name) From 62fe44e184ace7da1b425f59e0b97acd20b5ca7e Mon Sep 17 00:00:00 2001 From: Vinicius Akira Imaizumi Date: Wed, 22 Feb 2023 18:09:20 +0100 Subject: [PATCH 004/577] Fix GH numbers --- pandas/core/indexes/base.py | 4 ++-- pandas/tests/dtypes/test_dtypes.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 40e855549aa87..bf338b0ed94f4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -517,7 +517,7 @@ def __new__( else: if tupleize_cols: - # GH21470: convert iterable to list before determining if empty + # GH50127: convert iterable to list before determining if empty if is_iterator(data): data = list(data) @@ -540,7 +540,7 @@ def __new__( # Ensure we get 1-D array of tuples instead of 2D array. data = com.asarray_tuplesafe(data, dtype=_dtype_obj) if not (isinstance(data[0], tuple)): - # GH#21470 we update data to a np array with the correct + # GH#50127 we update data to a np array with the correct # dtype. 
data = astype_array(data, dtype=dtype, copy=copy) return Index(data, dtype=dtype, copy=copy, name=name) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index dfd1abe78187f..00bf38274a4d7 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1121,7 +1121,7 @@ def test_multi_column_dtype_assignment(): def test_numpy_s3_dtype_on_index(): - # GH #21470 + # GH #50127 index = pd.Index(["abcd", "1234"], dtype="S3") expected = pd.Index(["abc", "123"], dtype="S3") assert index.dtype == pandas_dtype("object") From a1a14f51bf1015b15c04f00b5227ff4ad8306b54 Mon Sep 17 00:00:00 2001 From: Vinicius Akira Imaizumi Date: Wed, 22 Feb 2023 18:36:39 +0100 Subject: [PATCH 005/577] Modify whatsnew and add add make sure asarray_tuplesafe works in base.py --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/indexes/base.py | 11 +++++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 45a5d139349e9..517b0926c39f1 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1418,6 +1418,7 @@ Other - Bug in :meth:`Series.searchsorted` inconsistent behavior when accepting :class:`DataFrame` as parameter ``value`` (:issue:`49620`) - Bug in :func:`array` failing to raise on :class:`DataFrame` inputs (:issue:`51167`) +- Bug in :class:`Index` when created or converted to the numpy dtype S3 would raise ``NotImplementedError`` (:issue:`50127`) .. --------------------------------------------------------------------------- .. 
_whatsnew_200.contributors: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bf338b0ed94f4..f026b2eafff3e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -536,14 +536,13 @@ def __new__( if len(data) == 0: # unlike Series, we default to object dtype: data = np.array(data, dtype=object) - else: + elif isinstance(data[0], tuple): # Ensure we get 1-D array of tuples instead of 2D array. data = com.asarray_tuplesafe(data, dtype=_dtype_obj) - if not (isinstance(data[0], tuple)): - # GH#50127 we update data to a np array with the correct - # dtype. - data = astype_array(data, dtype=dtype, copy=copy) - return Index(data, dtype=dtype, copy=copy, name=name) + elif isinstance(dtype, np.dtype): + # GH#50127 we update data to a np array with the correct dtype. + data = com.asarray_tuplesafe(data, dtype=dtype) + return Index(data, dtype=dtype, copy=copy, name=name) try: arr = sanitize_array(data, None, dtype=dtype, copy=copy) From 07c087172585724fb3d02849754b30a413d3bfb3 Mon Sep 17 00:00:00 2001 From: Vinicius Akira Imaizumi Date: Mon, 27 Feb 2023 20:23:43 +0100 Subject: [PATCH 006/577] Add granular if to avoid unpredicted side effects --- pandas/core/indexes/base.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f026b2eafff3e..919d6e12d4727 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -501,10 +501,11 @@ def __new__( # they are actually ints, e.g. '0' and 0.0 # should not be coerced data = com.asarray_tuplesafe(data, dtype=_dtype_obj) - # GH#21470 we must update the `dtype` to avoid passing e.g. the - # numpy type `S3` to `_dtype_to_subclass`, which would raise a - # NotImplementedError. - dtype = _dtype_obj + # # GH#50127 we must update the `dtype` when we have the numpy + # # type `S` to `_dtype_to_subclass`, because it would raise a + # # `NotImplementedError``. 
+ if dtype and dtype.kind == "S": + dtype = _dtype_obj elif is_scalar(data): raise cls._raise_scalar_data_error(data) @@ -517,7 +518,7 @@ def __new__( else: if tupleize_cols: - # GH50127: convert iterable to list before determining if empty + # GH21470: convert iterable to list before determining if empty if is_iterator(data): data = list(data) @@ -539,9 +540,9 @@ def __new__( elif isinstance(data[0], tuple): # Ensure we get 1-D array of tuples instead of 2D array. data = com.asarray_tuplesafe(data, dtype=_dtype_obj) - elif isinstance(dtype, np.dtype): - # GH#50127 we update data to a np array with the correct dtype. - data = com.asarray_tuplesafe(data, dtype=dtype) + elif dtype and dtype.kind == "S": + # GH#50127 we update data to a np.array with the correct dtype. + data = astype_array(np.array(data, copy=copy), dtype=dtype, copy=copy) return Index(data, dtype=dtype, copy=copy, name=name) try: From 479eeeec74acd35d4c63679b36a12a541079b7ea Mon Sep 17 00:00:00 2001 From: Vinicius Akira Imaizumi Date: Mon, 27 Feb 2023 23:53:44 +0100 Subject: [PATCH 007/577] Use ssert_index_equal instead of frame --- pandas/tests/dtypes/test_dtypes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 00bf38274a4d7..8e19b3e677324 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1125,8 +1125,8 @@ def test_numpy_s3_dtype_on_index(): index = pd.Index(["abcd", "1234"], dtype="S3") expected = pd.Index(["abc", "123"], dtype="S3") assert index.dtype == pandas_dtype("object") - tm.assert_frame_equal(index, expected) + tm.assert_index_equal(index, expected) index = pd.Index(["abcd", "1234"]) expected = pd.Index(["abc", "123"], dtype="S3") - tm.assert_frame_equal(index.astype("S3"), expected) + tm.assert_index_equal(index.astype("S3"), expected) From b5f7b091ce713ad26488f017050340d392a04a9d Mon Sep 17 00:00:00 2001 From: Vinicius Akira Imaizumi Date: Wed, 
15 Mar 2023 16:42:01 +0100 Subject: [PATCH 008/577] Removed recursion, and removed double # --- pandas/core/indexes/base.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a08b422bf17c5..73e6ebaab1a98 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -501,9 +501,9 @@ def __new__( # they are actually ints, e.g. '0' and 0.0 # should not be coerced data = com.asarray_tuplesafe(data, dtype=_dtype_obj) - # # GH#50127 we must update the `dtype` when we have the numpy - # # type `S` to `_dtype_to_subclass`, because it would raise a - # # `NotImplementedError``. + # GH#50127 we must update the `dtype` when we have the numpy + # type `S` to `_dtype_to_subclass`, because it would raise a + # `NotImplementedError`. if dtype and dtype.kind == "S": dtype = _dtype_obj @@ -542,8 +542,8 @@ def __new__( data = com.asarray_tuplesafe(data, dtype=_dtype_obj) elif dtype and dtype.kind == "S": # GH#50127 we update data to a np.array with the correct dtype. 
- data = astype_array(np.array(data, copy=copy), dtype=dtype, copy=copy) - return Index(data, dtype=dtype, copy=copy, name=name) + # data = astype_array(np.array(data, copy=copy), dtype=dtype, copy=copy) + dtype = _dtype_obj try: arr = sanitize_array(data, None, dtype=dtype, copy=copy) From c834e667f4c41e636d3a89569e736cde28c97a95 Mon Sep 17 00:00:00 2001 From: Vinicius Akira Imaizumi Date: Wed, 15 Mar 2023 16:49:03 +0100 Subject: [PATCH 009/577] Keep data update to match correct type --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 73e6ebaab1a98..8885ad6077c22 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -542,7 +542,7 @@ def __new__( data = com.asarray_tuplesafe(data, dtype=_dtype_obj) elif dtype and dtype.kind == "S": # GH#50127 we update data to a np.array with the correct dtype. - # data = astype_array(np.array(data, copy=copy), dtype=dtype, copy=copy) + data = astype_array(np.array(data, copy=copy), dtype=dtype, copy=copy) dtype = _dtype_obj try: From d59ef41f3377bc44bf729e7c0f64ea104ea249f9 Mon Sep 17 00:00:00 2001 From: Vinicius Akira Imaizumi Date: Tue, 21 Mar 2023 14:31:51 +0100 Subject: [PATCH 010/577] Update test which was failing due to dtype S3 not working --- pandas/tests/io/test_parquet.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index b55e97a4fe0ae..202adf2e08d9a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1036,9 +1036,7 @@ def test_columns_dtypes_not_invalid(self, pa): # bytes df.columns = [b"foo", b"bar"] - with pytest.raises(NotImplementedError, match="|S3"): - # Bytes fails on read_parquet - check_round_trip(df, pa) + check_round_trip(df, pa) # python object df.columns = [ From 76cb5cf5e7d2857464909e2ddbc78fface8cd491 Mon Sep 17 00:00:00 2001 From: Vinicius Akira 
Imaizumi Date: Tue, 21 Mar 2023 14:33:44 +0100 Subject: [PATCH 011/577] Update test which was failing due to dtype S3 not working --- pandas/tests/io/test_parquet.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index b55e97a4fe0ae..202adf2e08d9a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1036,9 +1036,7 @@ def test_columns_dtypes_not_invalid(self, pa): # bytes df.columns = [b"foo", b"bar"] - with pytest.raises(NotImplementedError, match="|S3"): - # Bytes fails on read_parquet - check_round_trip(df, pa) + check_round_trip(df, pa) # python object df.columns = [ From 606849dac3bd3bd8f2ea2fae44ded3f4d55c5884 Mon Sep 17 00:00:00 2001 From: Daquisu <50253469+Daquisu@users.noreply.github.com> Date: Sun, 9 Apr 2023 13:51:34 +0200 Subject: [PATCH 012/577] Fix whatsnew message --- doc/source/whatsnew/v2.0.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index caf237fb15163..49f6053708f32 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -32,7 +32,7 @@ Bug fixes Other ~~~~~ -- +- Bug in :class:`Index` where creating or converting to the numpy dtype S3 would raise ``NotImplementedError`` (:issue:`50127`) .. --------------------------------------------------------------------------- .. 
_whatsnew_201.contributors: From 8170407648fb05b52ca945c81ce711274ad81ef5 Mon Sep 17 00:00:00 2001 From: Daquisu <50253469+Daquisu@users.noreply.github.com> Date: Sun, 9 Apr 2023 13:52:21 +0200 Subject: [PATCH 013/577] Remove message from whatsnew v2.0.0.rst --- doc/source/whatsnew/v2.0.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index f4eaf59fae30c..2ee6ecc4e6cd4 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1406,7 +1406,6 @@ Other - Bug in incorrectly accepting dtype strings containing "[pyarrow]" more than once (:issue:`51548`) - Bug in :meth:`Series.searchsorted` inconsistent behavior when accepting :class:`DataFrame` as parameter ``value`` (:issue:`49620`) - Bug in :func:`array` failing to raise on :class:`DataFrame` inputs (:issue:`51167`) -- Bug in :class:`Index` when created or converted to the numpy dtype S3 would raise ``NotImplementedError`` (:issue:`50127`) .. --------------------------------------------------------------------------- .. _whatsnew_200.contributors: From e1c88ae646baa9b25e265c309a538d1b6c09df04 Mon Sep 17 00:00:00 2001 From: Daquisu <50253469+Daquisu@users.noreply.github.com> Date: Sun, 9 Apr 2023 18:35:16 +0200 Subject: [PATCH 014/577] Simplify convertion to np array for GH#50127 --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 442fe77ef25f8..bb01dc25d0d35 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -553,7 +553,7 @@ def __new__( data = com.asarray_tuplesafe(data, dtype=_dtype_obj) elif dtype and dtype.kind == "S": # GH#50127 we update data to a np.array with the correct dtype. 
- data = astype_array(np.array(data, copy=copy), dtype=dtype, copy=copy) + data = np.array(data, dtype=object, copy=copy) dtype = _dtype_obj try: From 5670bc1d360ea9668c5549b7cfa5bfe03fd752ad Mon Sep 17 00:00:00 2001 From: Daquisu <50253469+Daquisu@users.noreply.github.com> Date: Sat, 22 Apr 2023 10:35:52 +0200 Subject: [PATCH 015/577] Set copy=false after new data array, and adjust texts --- doc/source/whatsnew/v2.0.1.rst | 2 +- pandas/core/indexes/base.py | 1 + pandas/tests/dtypes/test_dtypes.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 49f6053708f32..f6a5c5c8babf8 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -32,7 +32,7 @@ Bug fixes Other ~~~~~ -- Bug in :class:`Index` where creating or converting to the numpy dtype S3 would raise ``NotImplementedError`` (:issue:`50127`) +- Bug in :class:`Index` where creating or converting to numpy string dtypes would raise ``NotImplementedError`` (:issue:`50127`) .. --------------------------------------------------------------------------- .. _whatsnew_201.contributors: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bb01dc25d0d35..65e3bb0ad4835 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -554,6 +554,7 @@ def __new__( elif dtype and dtype.kind == "S": # GH#50127 we update data to a np.array with the correct dtype. 
data = np.array(data, dtype=object, copy=copy) + copy = False dtype = _dtype_obj try: diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 039c684e3c753..45cc38c6aad4f 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1141,7 +1141,7 @@ def test_multi_column_dtype_assignment(): def test_numpy_s3_dtype_on_index(): - # GH #50127 + # GH#50127 index = pd.Index(["abcd", "1234"], dtype="S3") expected = pd.Index(["abc", "123"], dtype="S3") assert index.dtype == pandas_dtype("object") From 88ef9fdbe565cd0f377a858e56be7ab649db77e2 Mon Sep 17 00:00:00 2001 From: Daquisu <50253469+Daquisu@users.noreply.github.com> Date: Sat, 22 Apr 2023 11:27:00 +0200 Subject: [PATCH 016/577] Alphabetical order for whatsnew, dtype=dtype instead of object --- doc/source/whatsnew/v2.0.1.rst | 2 +- pandas/core/indexes/base.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 45c90e6032b6f..332a0243cdd75 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -47,11 +47,11 @@ Bug fixes Other ~~~~~ +- Bug in :class:`Index` where creating or converting to numpy string dtypes would raise ``NotImplementedError`` (:issue:`50127`) - :class:`DataFrame` created from empty dicts had :attr:`~DataFrame.columns` of dtype ``object``. It is now a :class:`RangeIndex` (:issue:`52404`) - :class:`Series` created from empty dicts had :attr:`~Series.index` of dtype ``object``. It is now a :class:`RangeIndex` (:issue:`52404`) - Implemented :meth:`Series.str.split` and :meth:`Series.str.rsplit` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`52401`) - Implemented most ``str`` accessor methods for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`52401`) -- Bug in :class:`Index` where creating or converting to numpy string dtypes would raise ``NotImplementedError`` (:issue:`50127`) .. 
--------------------------------------------------------------------------- .. _whatsnew_201.contributors: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2b018c2effc01..8c557c2824ed4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -554,7 +554,7 @@ def __new__( data = com.asarray_tuplesafe(data, dtype=_dtype_obj) elif dtype and dtype.kind == "S": # GH#50127 we update data to a np.array with the correct dtype. - data = np.array(data, dtype=object, copy=copy) + data = np.array(data, dtype=dtype, copy=copy) copy = False dtype = _dtype_obj From 6120ddd76af38432e68a038b6bea3fb5bd4b9f2b Mon Sep 17 00:00:00 2001 From: Daquisu <50253469+Daquisu@users.noreply.github.com> Date: Sat, 22 Apr 2023 11:35:46 +0200 Subject: [PATCH 017/577] Fix precommit --- doc/source/whatsnew/v2.0.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 332a0243cdd75..a322235e89e78 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -47,9 +47,9 @@ Bug fixes Other ~~~~~ -- Bug in :class:`Index` where creating or converting to numpy string dtypes would raise ``NotImplementedError`` (:issue:`50127`) - :class:`DataFrame` created from empty dicts had :attr:`~DataFrame.columns` of dtype ``object``. It is now a :class:`RangeIndex` (:issue:`52404`) - :class:`Series` created from empty dicts had :attr:`~Series.index` of dtype ``object``. 
It is now a :class:`RangeIndex` (:issue:`52404`) +- Bug in :class:`Index` where creating or converting to numpy string dtypes would raise ``NotImplementedError`` (:issue:`50127`) - Implemented :meth:`Series.str.split` and :meth:`Series.str.rsplit` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`52401`) - Implemented most ``str`` accessor methods for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`52401`) From 39769ca7aa56d08e58e638445be15632ed2641d7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sat, 22 Apr 2023 02:43:36 -0700 Subject: [PATCH 018/577] BUG: dt.round with equal/higher freq resolution would not noop (#52841) --- doc/source/whatsnew/v2.0.1.rst | 1 + pandas/core/arrays/datetimelike.py | 8 ++++++-- pandas/tests/series/accessors/test_dt_accessor.py | 11 +++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index a322235e89e78..0ca1b6f54671d 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -27,6 +27,7 @@ Bug fixes ~~~~~~~~~ - Bug in :attr:`Series.dt.days` that would overflow ``int32`` number of days (:issue:`52391`) - Bug in :class:`arrays.DatetimeArray` constructor returning an incorrect unit when passed a non-nanosecond numpy datetime array (:issue:`52555`) +- Bug in :func:`Series.dt.round` when passing a ``freq`` of equal or higher resolution compared to the :class:`Series` would raise a ``ZeroDivisionError`` (:issue:`52761`) - Bug in :func:`Series.median` with :class:`ArrowDtype` returning an approximate median (:issue:`52679`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on categorical dtypes (:issue:`49889`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on large string dtypes (:issue:`52795`) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 234d26186aab0..cb757a412f34d 
100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -2050,8 +2050,12 @@ def _round(self, freq, mode, ambiguous, nonexistent): values = self.view("i8") values = cast(np.ndarray, values) - nanos = to_offset(freq).nanos # raises on non-fixed frequencies - nanos = delta_to_nanoseconds(to_offset(freq), self._creso) + offset = to_offset(freq) + offset.nanos # raises on non-fixed frequencies + nanos = delta_to_nanoseconds(offset, self._creso) + if nanos == 0: + # GH 52761 + return self result_i8 = round_nsint64(values, mode, nanos) result = self._maybe_mask_results(result_i8, fill_value=iNaT) result = result.view(self._ndarray.dtype) diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 21c1e9ca84a35..aed5079a33e5b 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -385,6 +385,17 @@ def test_dt_round_tz_nonexistent(self, method, ts_str, freq): with pytest.raises(pytz.NonExistentTimeError, match="2018-03-11 02:00:00"): getattr(ser.dt, method)(freq, nonexistent="raise") + @pytest.mark.parametrize("freq", ["ns", "U", "1000U"]) + def test_dt_round_nonnano_higher_resolution_no_op(self, freq): + # GH 52761 + ser = Series( + ["2020-05-31 08:00:00", "2000-12-31 04:00:05", "1800-03-14 07:30:20"], + dtype="datetime64[ms]", + ) + expected = ser.copy() + result = ser.dt.round(freq) + tm.assert_series_equal(result, expected) + def test_dt_namespace_accessor_categorical(self): # GH 19468 dti = DatetimeIndex(["20171111", "20181212"]).repeat(2) From fb5c99c8e9bea464686317f05f4def1141eb9f56 Mon Sep 17 00:00:00 2001 From: gmaiwald <80886061+gmaiwald@users.noreply.github.com> Date: Sat, 22 Apr 2023 11:55:18 +0200 Subject: [PATCH 019/577] TST: Test groupby for columns with string objects (#52757) --- pandas/tests/groupby/test_groupby.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git 
a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b5b13d6b10511..29382be60b08b 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2970,3 +2970,14 @@ def test_groupby_numeric_only_std_no_result(numeric_only): ValueError, match="could not convert string to float: 'bar'" ): dfgb.std(numeric_only=numeric_only) + + +@pytest.mark.parametrize("bug_var", [1, "a"]) +def test_groupby_sum_on_nan_should_return_nan(bug_var): + # GH 24196 + df = DataFrame({"A": [bug_var, bug_var, bug_var, np.nan]}) + dfgb = df.groupby(lambda x: x) + result = dfgb.sum(min_count=1) + + expected_df = DataFrame([bug_var, bug_var, bug_var, np.nan], columns=["A"]) + tm.assert_frame_equal(result, expected_df) From 2aa6e6fb6738f26c029576c2fd7137d939286443 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 22 Apr 2023 03:33:53 -0700 Subject: [PATCH 020/577] DEPR: is_period_dtype, is_sparse (#52642) --- doc/source/user_guide/io.rst | 4 ++++ doc/source/user_guide/scale.rst | 3 +++ doc/source/whatsnew/v0.19.0.rst | 1 + doc/source/whatsnew/v2.1.0.rst | 2 ++ pandas/conftest.py | 4 +++- pandas/core/arrays/datetimes.py | 8 +++++-- pandas/core/arrays/period.py | 5 ++-- pandas/core/dtypes/common.py | 33 +++++++++++++++++--------- pandas/tests/dtypes/test_common.py | 29 ++++++++++++---------- pandas/tests/dtypes/test_dtypes.py | 24 +++++++++---------- pandas/tests/extension/base/missing.py | 3 +-- 11 files changed, 74 insertions(+), 42 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index cc8c1299ab137..60353dde5683f 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5239,6 +5239,7 @@ See the `Full Documentation `__. Write to a feather file. .. ipython:: python + :okwarning: df.to_feather("example.feather") @@ -5382,6 +5383,7 @@ Serializing a ``DataFrame`` to parquet may include the implicit index as one or more columns in the output file. Thus, this code: .. 
ipython:: python + :okwarning: df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) df.to_parquet("test.parquet", engine="pyarrow") @@ -5398,6 +5400,7 @@ If you want to omit a dataframe's indexes when writing, pass ``index=False`` to :func:`~pandas.DataFrame.to_parquet`: .. ipython:: python + :okwarning: df.to_parquet("test.parquet", index=False) @@ -5420,6 +5423,7 @@ Partitioning Parquet files Parquet supports partitioning of data based on the values of one or more columns. .. ipython:: python + :okwarning: df = pd.DataFrame({"a": [0, 0, 1, 1], "b": [0, 1, 0, 1]}) df.to_parquet(path="test", engine="pyarrow", partition_cols=["a"], compression=None) diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index 182f1ca39e0a8..261852692ab2f 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -42,6 +42,7 @@ Suppose our raw dataset on disk has many columns:: That can be generated by the following code snippet: .. ipython:: python + :okwarning: import pandas as pd import numpy as np @@ -106,6 +107,7 @@ referred to as "low-cardinality" data). By using more efficient data types, you can store larger datasets in memory. .. ipython:: python + :okwarning: ts = make_timeseries(freq="30S", seed=0) ts.to_parquet("timeseries.parquet") @@ -183,6 +185,7 @@ Suppose we have an even larger "logical dataset" on disk that's a directory of p files. Each file in the directory represents a different year of the entire dataset. .. ipython:: python + :okwarning: import pathlib diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index ab17cacd830e5..d4b879f137698 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -905,6 +905,7 @@ As a consequence of this change, ``PeriodIndex`` no longer has an integer dtype: **New behavior**: .. 
ipython:: python + :okwarning: pi = pd.PeriodIndex(["2016-08-01"], freq="D") pi diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index efc8bc695df85..b0e9fa2cea0ee 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -234,6 +234,8 @@ Deprecations - Deprecated :func:`is_datetime64tz_dtype`, check ``isinstance(dtype, pd.DatetimeTZDtype)`` instead (:issue:`52607`) - Deprecated :func:`is_int64_dtype`, check ``dtype == np.dtype(np.int64)`` instead (:issue:`52564`) - Deprecated :func:`is_interval_dtype`, check ``isinstance(dtype, pd.IntervalDtype)`` instead (:issue:`52607`) +- Deprecated :func:`is_period_dtype`, check ``isinstance(dtype, pd.PeriodDtype)`` instead (:issue:`52642`) +- Deprecated :func:`is_sparse`, check ``isinstance(dtype, pd.SparseDtype)`` instead (:issue:`52642`) - Deprecated :meth:`DataFrame.applymap`. Use the new :meth:`DataFrame.map` method instead (:issue:`52353`) - Deprecated :meth:`DataFrame.swapaxes` and :meth:`Series.swapaxes`, use :meth:`DataFrame.transpose` or :meth:`Series.transpose` instead (:issue:`51946`) - Deprecated ``freq`` parameter in :class:`PeriodArray` constructor, pass ``dtype`` instead (:issue:`52462`) diff --git a/pandas/conftest.py b/pandas/conftest.py index 7773d8de37705..77d2f4802c08f 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -137,7 +137,10 @@ def pytest_collection_modifyitems(items, config) -> None: ignored_doctest_warnings = [ ("is_int64_dtype", "is_int64_dtype is deprecated"), ("is_interval_dtype", "is_interval_dtype is deprecated"), + ("is_period_dtype", "is_period_dtype is deprecated"), ("is_datetime64tz_dtype", "is_datetime64tz_dtype is deprecated"), + ("is_categorical_dtype", "is_categorical_dtype is deprecated"), + ("is_sparse", "is_sparse is deprecated"), # Docstring divides by zero to show behavior difference ("missing.mask_zero_div_zero", "divide by zero encountered"), ( @@ -149,7 +152,6 @@ def pytest_collection_modifyitems(items, config) 
-> None: "(Series|DataFrame).bool is now deprecated and will be removed " "in future version of pandas", ), - ("is_categorical_dtype", "is_categorical_dtype is deprecated"), ] for item in items: diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 84743783d2f26..a765f4ae1b21b 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -52,7 +52,6 @@ is_bool_dtype, is_dtype_equal, is_float_dtype, - is_sparse, is_string_dtype, pandas_dtype, ) @@ -65,6 +64,7 @@ from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays._ranges import generate_regular_range +from pandas.core.arrays.sparse.dtype import SparseDtype import pandas.core.common as com from pandas.tseries.frequencies import get_period_alias @@ -2038,7 +2038,11 @@ def _sequence_to_dt64ns( if out_unit is not None: out_dtype = np.dtype(f"M8[{out_unit}]") - if data_dtype == object or is_string_dtype(data_dtype) or is_sparse(data_dtype): + if ( + data_dtype == object + or is_string_dtype(data_dtype) + or isinstance(data_dtype, SparseDtype) + ): # TODO: We do not have tests specific to string-dtypes, # also complex or categorical or other extension copy = False diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 95e7135d754cb..542120fd7df0a 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -56,7 +56,6 @@ from pandas.core.dtypes.common import ( ensure_object, is_dtype_equal, - is_period_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -172,7 +171,9 @@ class PeriodArray(dtl.DatelikeOps, libperiod.PeriodMixin): # type: ignore[misc] _typ = "periodarray" # ABCPeriodArray _internal_fill_value = np.int64(iNaT) _recognized_scalars = (Period,) - _is_recognized_dtype = is_period_dtype # check_compatible_with checks freq match + _is_recognized_dtype = lambda x: isinstance( + x, PeriodDtype + ) # check_compatible_with checks freq match _infer_matches = ("period",) @property diff 
--git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index b6f3210e35c81..e8f0ed1b80150 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -207,6 +207,12 @@ def is_sparse(arr) -> bool: Returns `False` if the parameter has more than one dimension. """ + warnings.warn( + "is_sparse is deprecated and will be removed in a future " + "version. Check `isinstance(dtype, pd.SparseDtype)` instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) from pandas.core.arrays.sparse import SparseDtype dtype = getattr(arr, "dtype", arr) @@ -399,6 +405,12 @@ def is_period_dtype(arr_or_dtype) -> bool: >>> is_period_dtype(pd.PeriodIndex([], freq="A")) True """ + warnings.warn( + "is_period_dtype is deprecated and will be removed in a future version. " + "Use `isinstance(dtype, pd.PeriodDtype)` instead", + FutureWarning, + stacklevel=find_stack_level(), + ) if isinstance(arr_or_dtype, ExtensionDtype): # GH#33400 fastpath for dtype object return arr_or_dtype.type is Period @@ -539,7 +551,7 @@ def is_string_dtype(arr_or_dtype) -> bool: >>> is_string_dtype(pd.Series([1, 2], dtype=object)) False """ - if hasattr(arr_or_dtype, "dtype") and get_dtype(arr_or_dtype).kind == "O": + if hasattr(arr_or_dtype, "dtype") and _get_dtype(arr_or_dtype).kind == "O": return is_all_strings(arr_or_dtype) def condition(dtype) -> bool: @@ -585,7 +597,7 @@ def is_dtype_equal(source, target) -> bool: # GH#38516 ensure we get the same behavior from # is_dtype_equal(CDT, "category") and CDT == "category" try: - src = get_dtype(source) + src = _get_dtype(source) if isinstance(src, ExtensionDtype): return src == target except (TypeError, AttributeError, ImportError): @@ -594,8 +606,8 @@ def is_dtype_equal(source, target) -> bool: return is_dtype_equal(target, source) try: - source = get_dtype(source) - target = get_dtype(target) + source = _get_dtype(source) + target = _get_dtype(target) return source == target except (TypeError, AttributeError, ImportError): # 
invalid comparison @@ -875,7 +887,7 @@ def is_datetime64_any_dtype(arr_or_dtype) -> bool: return False try: - tipo = get_dtype(arr_or_dtype) + tipo = _get_dtype(arr_or_dtype) except TypeError: return False return (isinstance(tipo, np.dtype) and tipo.kind == "M") or isinstance( @@ -923,7 +935,7 @@ def is_datetime64_ns_dtype(arr_or_dtype) -> bool: if arr_or_dtype is None: return False try: - tipo = get_dtype(arr_or_dtype) + tipo = _get_dtype(arr_or_dtype) except TypeError: return False return tipo == DT64NS_DTYPE or ( @@ -1214,7 +1226,7 @@ def is_bool_dtype(arr_or_dtype) -> bool: if arr_or_dtype is None: return False try: - dtype = get_dtype(arr_or_dtype) + dtype = _get_dtype(arr_or_dtype) except (TypeError, ValueError): return False @@ -1373,13 +1385,13 @@ def _is_dtype(arr_or_dtype, condition) -> bool: if arr_or_dtype is None: return False try: - dtype = get_dtype(arr_or_dtype) + dtype = _get_dtype(arr_or_dtype) except (TypeError, ValueError): return False return condition(dtype) -def get_dtype(arr_or_dtype) -> DtypeObj: +def _get_dtype(arr_or_dtype) -> DtypeObj: """ Get the dtype instance associated with an array or dtype object. 
@@ -1510,7 +1522,7 @@ def infer_dtype_from_object(dtype) -> type: try: return infer_dtype_from_object(getattr(np, dtype)) except (AttributeError, TypeError): - # Handles cases like get_dtype(int) i.e., + # Handles cases like _get_dtype(int) i.e., # Python objects that are valid dtypes # (unlike user-defined types, in general) # @@ -1653,7 +1665,6 @@ def is_all_strings(value: ArrayLike) -> bool: "ensure_float64", "ensure_python_int", "ensure_str", - "get_dtype", "infer_dtype_from_object", "INT64_DTYPE", "is_1d_only_ea_dtype", diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index a28a5b62f4ad0..85fbac186b369 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -177,6 +177,7 @@ def test_get_dtype_error_catch(func): or func is com.is_interval_dtype or func is com.is_datetime64tz_dtype or func is com.is_categorical_dtype + or func is com.is_period_dtype ): warn = FutureWarning @@ -197,14 +198,16 @@ def test_is_object(): "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] ) def test_is_sparse(check_scipy): - assert com.is_sparse(SparseArray([1, 2, 3])) + msg = "is_sparse is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert com.is_sparse(SparseArray([1, 2, 3])) - assert not com.is_sparse(np.array([1, 2, 3])) + assert not com.is_sparse(np.array([1, 2, 3])) - if check_scipy: - import scipy.sparse + if check_scipy: + import scipy.sparse - assert not com.is_sparse(scipy.sparse.bsr_matrix([1, 2, 3])) + assert not com.is_sparse(scipy.sparse.bsr_matrix([1, 2, 3])) @td.skip_if_no_scipy @@ -264,12 +267,14 @@ def test_is_timedelta64_dtype(): def test_is_period_dtype(): - assert not com.is_period_dtype(object) - assert not com.is_period_dtype([1, 2, 3]) - assert not com.is_period_dtype(pd.Period("2017-01-01")) + msg = "is_period_dtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert not com.is_period_dtype(object) + assert 
not com.is_period_dtype([1, 2, 3]) + assert not com.is_period_dtype(pd.Period("2017-01-01")) - assert com.is_period_dtype(PeriodDtype(freq="D")) - assert com.is_period_dtype(pd.PeriodIndex([], freq="A")) + assert com.is_period_dtype(PeriodDtype(freq="D")) + assert com.is_period_dtype(pd.PeriodIndex([], freq="A")) def test_is_interval_dtype(): @@ -681,7 +686,7 @@ def test_is_complex_dtype(): ], ) def test_get_dtype(input_param, result): - assert com.get_dtype(input_param) == result + assert com._get_dtype(input_param) == result @pytest.mark.parametrize( @@ -700,7 +705,7 @@ def test_get_dtype_fails(input_param, expected_error_message): # 2020-02-02 npdev changed error message expected_error_message += f"|Cannot interpret '{input_param}' as a data type" with pytest.raises(TypeError, match=expected_error_message): - com.get_dtype(input_param) + com._get_dtype(input_param) @pytest.mark.parametrize( diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index e99598179f79a..5b4e2ea7d9035 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -428,12 +428,10 @@ def test_construction(self): for s in ["period[D]", "Period[D]", "D"]: dt = PeriodDtype(s) assert dt.freq == pd.tseries.offsets.Day() - assert is_period_dtype(dt) for s in ["period[3D]", "Period[3D]", "3D"]: dt = PeriodDtype(s) assert dt.freq == pd.tseries.offsets.Day(3) - assert is_period_dtype(dt) for s in [ "period[26H]", @@ -445,7 +443,6 @@ def test_construction(self): ]: dt = PeriodDtype(s) assert dt.freq == pd.tseries.offsets.Hour(26) - assert is_period_dtype(dt) def test_cannot_use_custom_businessday(self): # GH#52534 @@ -531,20 +528,22 @@ def test_equality(self, dtype): assert not is_dtype_equal(PeriodDtype("D"), PeriodDtype("2D")) def test_basic(self, dtype): - assert is_period_dtype(dtype) + msg = "is_period_dtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert is_period_dtype(dtype) - pidx = 
pd.period_range("2013-01-01 09:00", periods=5, freq="H") + pidx = pd.period_range("2013-01-01 09:00", periods=5, freq="H") - assert is_period_dtype(pidx.dtype) - assert is_period_dtype(pidx) + assert is_period_dtype(pidx.dtype) + assert is_period_dtype(pidx) - s = Series(pidx, name="A") + s = Series(pidx, name="A") - assert is_period_dtype(s.dtype) - assert is_period_dtype(s) + assert is_period_dtype(s.dtype) + assert is_period_dtype(s) - assert not is_period_dtype(np.dtype("float64")) - assert not is_period_dtype(1.0) + assert not is_period_dtype(np.dtype("float64")) + assert not is_period_dtype(1.0) def test_freq_argument_required(self): # GH#27388 @@ -1133,6 +1132,7 @@ def test_is_dtype_no_warning(check): check is is_categorical_dtype or check is is_interval_dtype or check is is_datetime64tz_dtype + or check is is_period_dtype ): warn = FutureWarning diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index cab81f864d8d8..8a53c06e0b7bf 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -3,7 +3,6 @@ import pandas as pd import pandas._testing as tm -from pandas.api.types import is_sparse from pandas.tests.extension.base.base import BaseExtensionTests @@ -28,7 +27,7 @@ def test_isna_returns_copy(self, data_missing, na_func): result = pd.Series(data_missing) expected = result.copy() mask = getattr(result, na_func)() - if is_sparse(mask): + if isinstance(mask.dtype, pd.SparseDtype): mask = np.array(mask) mask[:] = True From aad307720e10371d1be4af32fedfbddf1e650aaa Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sat, 22 Apr 2023 03:38:48 -0700 Subject: [PATCH 021/577] BUG: Non unitless np NaT arithmetic with non-nano (#52821) --- doc/source/whatsnew/v2.0.1.rst | 1 + pandas/core/ops/array_ops.py | 25 +++++++++++++-- pandas/tests/arithmetic/test_datetime64.py | 37 ++++++++++++++++++++++ pandas/tests/arithmetic/test_numeric.py | 1 
+ 4 files changed, 61 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 0ca1b6f54671d..5689c915fb254 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -39,6 +39,7 @@ Bug fixes - Bug in :meth:`DataFrame.max` and related casting different :class:`Timestamp` resolutions always to nanoseconds (:issue:`52524`) - Bug in :meth:`Series.describe` not returning :class:`ArrowDtype` with ``pyarrow.float64`` type with numeric data (:issue:`52427`) - Bug in :meth:`Series.dt.tz_localize` incorrectly localizing timestamps with :class:`ArrowDtype` (:issue:`52677`) +- Bug in arithmetic between ``np.datetime64`` and ``np.timedelta64`` ``NaT`` scalars with units always returning nanosecond resolution (:issue:`52295`) - Bug in logical and comparison operations between :class:`ArrowDtype` and numpy masked types (e.g. ``"boolean"``) (:issue:`52625`) - Fixed bug in :func:`merge` when merging with ``ArrowDtype`` one one and a NumPy dtype on the other side (:issue:`52406`) - Fixed segfault in :meth:`Series.to_numpy` with ``null[pyarrow]`` dtype (:issue:`52443`) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 8b39089bfb1d5..b39930da9f711 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -22,7 +22,14 @@ lib, ops as libops, ) -from pandas._libs.tslibs import BaseOffset +from pandas._libs.tslibs import ( + BaseOffset, + get_supported_reso, + get_unit_from_dtype, + is_supported_unit, + is_unitless, + npy_unit_to_abbrev, +) from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import ( @@ -533,7 +540,13 @@ def maybe_prepare_scalar_for_op(obj, shape: Shape): from pandas.core.arrays import DatetimeArray # Avoid possible ambiguities with pd.NaT - obj = obj.astype("datetime64[ns]") + # GH 52295 + if is_unitless(obj.dtype): + obj = obj.astype("datetime64[ns]") + elif not 
is_supported_unit(get_unit_from_dtype(obj.dtype)): + unit = get_unit_from_dtype(obj.dtype) + closest_unit = npy_unit_to_abbrev(get_supported_reso(unit)) + obj = obj.astype(f"datetime64[{closest_unit}]") right = np.broadcast_to(obj, shape) return DatetimeArray(right) @@ -546,7 +559,13 @@ def maybe_prepare_scalar_for_op(obj, shape: Shape): # wrapping timedelta64("NaT") in Timedelta returns NaT, # which would incorrectly be treated as a datetime-NaT, so # we broadcast and wrap in a TimedeltaArray - obj = obj.astype("timedelta64[ns]") + # GH 52295 + if is_unitless(obj.dtype): + obj = obj.astype("timedelta64[ns]") + elif not is_supported_unit(get_unit_from_dtype(obj.dtype)): + unit = get_unit_from_dtype(obj.dtype) + closest_unit = npy_unit_to_abbrev(get_supported_reso(unit)) + obj = obj.astype(f"timedelta64[{closest_unit}]") right = np.broadcast_to(obj, shape) return TimedeltaArray(right) diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index b9c23d5029e22..6a0584485be42 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -2436,3 +2436,40 @@ def test_dt64arr_addsub_object_dtype_2d(): assert result2.shape == (4, 1) assert all(td._value == 0 for td in result2.ravel()) + + +def test_non_nano_dt64_addsub_np_nat_scalars(): + # GH 52295 + ser = Series([1233242342344, 232432434324, 332434242344], dtype="datetime64[ms]") + result = ser - np.datetime64("nat", "ms") + expected = Series([NaT] * 3, dtype="timedelta64[ms]") + tm.assert_series_equal(result, expected) + + result = ser + np.timedelta64("nat", "ms") + expected = Series([NaT] * 3, dtype="datetime64[ms]") + tm.assert_series_equal(result, expected) + + +def test_non_nano_dt64_addsub_np_nat_scalars_unitless(): + # GH 52295 + # TODO: Can we default to the ser unit? 
+ ser = Series([1233242342344, 232432434324, 332434242344], dtype="datetime64[ms]") + result = ser - np.datetime64("nat") + expected = Series([NaT] * 3, dtype="timedelta64[ns]") + tm.assert_series_equal(result, expected) + + result = ser + np.timedelta64("nat") + expected = Series([NaT] * 3, dtype="datetime64[ns]") + tm.assert_series_equal(result, expected) + + +def test_non_nano_dt64_addsub_np_nat_scalars_unsupported_unit(): + # GH 52295 + ser = Series([12332, 23243, 33243], dtype="datetime64[s]") + result = ser - np.datetime64("nat", "D") + expected = Series([NaT] * 3, dtype="timedelta64[s]") + tm.assert_series_equal(result, expected) + + result = ser + np.timedelta64("nat", "D") + expected = Series([NaT] * 3, dtype="datetime64[s]") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index eaf80f4768458..a03c69d8e849c 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -292,6 +292,7 @@ def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box_with_array np.datetime64("NaT", "ns"), pd.NaT, ], + ids=repr, ) def test_add_sub_datetimedeltalike_invalid( self, numeric_idx, other, box_with_array From 92249ab4dc5b31dc4d251c3184332c9f71fc6bf2 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sat, 22 Apr 2023 13:22:04 -0400 Subject: [PATCH 022/577] REGR: SeriesGroupBy.agg with multiple categoricals, as_index=False, and a list fails (#52850) --- doc/source/whatsnew/v2.0.1.rst | 1 + pandas/core/groupby/generic.py | 4 +- pandas/tests/groupby/test_categorical.py | 48 ++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 5689c915fb254..b203c65b034c6 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -19,6 +19,7 @@ Fixed regressions - Fixed 
regression in :meth:`DataFrame.sort_values` not resetting index when :class:`DataFrame` is already sorted and ``ignore_index=True`` (:issue:`52553`) - Fixed regression in :meth:`MultiIndex.isin` raising ``TypeError`` for ``Generator`` (:issue:`52568`) - Fixed regression in :meth:`Series.describe` showing ``RuntimeWarning`` for extension dtype :class:`Series` with one element (:issue:`52515`) +- Fixed regression in :meth:`SeriesGroupBy.agg` failing when grouping with categorical data, multiple groupings, ``as_index=False``, and a list of aggregations (:issue:`52760`) .. --------------------------------------------------------------------------- .. _whatsnew_201.bug_fixes: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2b68c002fe49f..d26448dffc11a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -245,8 +245,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) assert columns is not None # for mypy ret.columns = columns if not self.as_index: - ret = self._insert_inaxis_grouper(ret) - ret.index = default_index(len(ret)) + ret = ret.reset_index() return ret else: @@ -352,7 +351,6 @@ def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame: output = self.obj._constructor_expanddim(indexed_output, index=None) output.columns = Index(key.label for key in results) - output = self._reindex_output(output) return output def _wrap_applied_output( diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index abce7dcfef444..f651609484f39 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -14,6 +14,7 @@ qcut, ) import pandas._testing as tm +from pandas.api.typing import SeriesGroupBy from pandas.tests.groupby import get_groupby_method_args @@ -2036,3 +2037,50 @@ def test_groupby_default_depr(cat_columns, keys): klass = FutureWarning if set(cat_columns) & set(keys) else None 
with tm.assert_produces_warning(klass, match=msg): df.groupby(keys) + + +@pytest.mark.parametrize("test_series", [True, False]) +@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) +def test_agg_list(request, as_index, observed, reduction_func, test_series, keys): + # GH#52760 + if test_series and reduction_func == "corrwith": + assert not hasattr(SeriesGroupBy, "corrwith") + pytest.skip("corrwith not implemented for SeriesGroupBy") + elif reduction_func == "corrwith": + msg = "GH#32293: attempts to call SeriesGroupBy.corrwith" + request.node.add_marker(pytest.mark.xfail(reason=msg)) + elif ( + reduction_func == "nunique" + and not test_series + and len(keys) != 1 + and not observed + and not as_index + ): + msg = "GH#52848 - raises a ValueError" + request.node.add_marker(pytest.mark.xfail(reason=msg)) + + df = DataFrame({"a1": [0, 0, 1], "a2": [2, 3, 3], "b": [4, 5, 6]}) + df = df.astype({"a1": "category", "a2": "category"}) + if "a2" not in keys: + df = df.drop(columns="a2") + gb = df.groupby(by=keys, as_index=as_index, observed=observed) + if test_series: + gb = gb["b"] + args = get_groupby_method_args(reduction_func, df) + + result = gb.agg([reduction_func], *args) + expected = getattr(gb, reduction_func)(*args) + + if as_index and (test_series or reduction_func == "size"): + expected = expected.to_frame(reduction_func) + if not test_series: + if not as_index: + # TODO: GH#52849 - as_index=False is not respected + expected = expected.set_index(keys) + expected.columns = MultiIndex( + levels=[["b"], [reduction_func]], codes=[[0], [0]] + ) + elif not as_index: + expected.columns = keys + [reduction_func] + + tm.assert_equal(result, expected) From bf535f16097f9dfaaa94b96f1cc4bf12e03bdb54 Mon Sep 17 00:00:00 2001 From: MirijaH <131352550+MirijaH@users.noreply.github.com> Date: Sat, 22 Apr 2023 19:23:42 +0200 Subject: [PATCH 023/577] Issue 21932: Added Regression Test for index of pivot_table on empty table (#52809) --- pandas/tests/reshape/test_pivot.py | 8 
++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 3fc617b91b866..efd5ec2f0868f 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2598,3 +2598,11 @@ def test_pivot_not_changing_index_name(self): expected = df.copy(deep=True) df.pivot(index="one", columns="two", values="three") tm.assert_frame_equal(df, expected) + + def test_pivot_table_empty_dataframe_correct_index(self): + # GH 21932 + df = DataFrame([], columns=["a", "b", "value"]) + pivot = df.pivot_table(index="a", columns="b", values="value", aggfunc="count") + + expected = Index([], dtype="object", name="b") + tm.assert_index_equal(pivot.columns, expected) From 8688ecfbe8410dc0cb2bf98f59c25d91cfee72f6 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sat, 22 Apr 2023 16:18:11 -0400 Subject: [PATCH 024/577] TST/CLN: Prepare moving tests out of groupby/test_allowlist (#52851) --- pandas/tests/groupby/test_allowlist.py | 71 +++++++------------------- 1 file changed, 19 insertions(+), 52 deletions(-) diff --git a/pandas/tests/groupby/test_allowlist.py b/pandas/tests/groupby/test_allowlist.py index 4dffad2b6b462..d495441593aed 100644 --- a/pandas/tests/groupby/test_allowlist.py +++ b/pandas/tests/groupby/test_allowlist.py @@ -3,72 +3,38 @@ Do not add tests here! 
""" -from string import ascii_lowercase - -import numpy as np import pytest from pandas import ( DataFrame, - Series, date_range, ) import pandas._testing as tm -AGG_FUNCTIONS = [ - "sum", - "prod", - "min", - "max", - "median", - "mean", - "skew", - "std", - "var", - "sem", -] -AGG_FUNCTIONS_WITH_SKIPNA = ["skew"] - - -@pytest.fixture -def df(): - return DataFrame( - { - "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], - "B": ["one", "one", "two", "three", "two", "two", "one", "three"], - "C": np.random.randn(8), - "D": np.random.randn(8), - } - ) - - -@pytest.fixture -def df_letters(): - letters = np.array(list(ascii_lowercase)) - N = 10 - random_letters = letters.take(np.random.randint(0, 26, N)) - df = DataFrame( - { - "floats": N / 10 * Series(np.random.random(N)), - "letters": Series(random_letters), - } - ) - return df - -@pytest.fixture -def raw_frame(): - return DataFrame([0]) - - -@pytest.mark.parametrize("op", AGG_FUNCTIONS) +@pytest.mark.parametrize( + "op", + [ + "sum", + "prod", + "min", + "max", + "median", + "mean", + "skew", + "std", + "var", + "sem", + ], +) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("sort", [True, False]) -def test_regression_allowlist_methods(raw_frame, op, axis, skipna, sort): +def test_regression_allowlist_methods(op, axis, skipna, sort): # GH6944 # GH 17537 # explicitly test the allowlist methods + raw_frame = DataFrame([0]) if axis == 0: frame = raw_frame msg = "The 'axis' keyword in DataFrame.groupby is deprecated and will be" @@ -79,7 +45,8 @@ def test_regression_allowlist_methods(raw_frame, op, axis, skipna, sort): with tm.assert_produces_warning(FutureWarning, match=msg): grouped = frame.groupby(level=0, axis=axis, sort=sort) - if op in AGG_FUNCTIONS_WITH_SKIPNA: + if op == "skew": + # skew has skipna result = getattr(grouped, op)(skipna=skipna) expected = frame.groupby(level=0).apply( lambda h: getattr(h, op)(axis=axis, skipna=skipna) 
From bf8485119cf67846678a72ae9adad89cbe8fe141 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 22 Apr 2023 21:51:56 +0100 Subject: [PATCH 025/577] CLN/depr: dt.to_pydatetime (#52803) --- pandas/core/indexes/accessors.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 4f529b71c867f..f86728ad8b686 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -304,6 +304,12 @@ def to_pydatetime(self) -> np.ndarray: """ Return the data as an array of :class:`datetime.datetime` objects. + .. deprecated:: 2.1.0 + + The current behavior of dt.to_pydatetime is deprecated. + In a future version this will return a Series containing python + datetime objects instead of a ndarray. + Timezone information is retained if present. .. warning:: From 42273e9d94d84e36462473535b880c46dd690e0f Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 23 Apr 2023 04:43:39 -0400 Subject: [PATCH 026/577] BUG: pyarrow duration arrays constructed from data containing NaT can overflow (#52843) --- doc/source/whatsnew/v2.0.1.rst | 1 + pandas/core/arrays/arrow/array.py | 8 +++++++- pandas/tests/extension/test_arrow.py | 16 ++++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index b203c65b034c6..cdc78dc60ea01 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -28,6 +28,7 @@ Bug fixes ~~~~~~~~~ - Bug in :attr:`Series.dt.days` that would overflow ``int32`` number of days (:issue:`52391`) - Bug in :class:`arrays.DatetimeArray` constructor returning an incorrect unit when passed a non-nanosecond numpy datetime array (:issue:`52555`) +- Bug in :class:`~arrays.ArrowExtensionArray` with duration dtype overflowing when constructed from data containing numpy ``NaT`` (:issue:`52843`) - Bug in :func:`Series.dt.round` when passing a ``freq`` of equal or higher resolution 
compared to the :class:`Series` would raise a ``ZeroDivisionError`` (:issue:`52761`) - Bug in :func:`Series.median` with :class:`ArrowDtype` returning an approximate median (:issue:`52679`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on categorical dtypes (:issue:`49889`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d83bf9d340993..51d6fa74ea94e 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -258,7 +258,13 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal scalars = pa.array(scalars, from_pandas=True) if pa_dtype and scalars.type != pa_dtype: scalars = scalars.cast(pa_dtype) - return cls(scalars) + arr = cls(scalars) + if pa.types.is_duration(scalars.type) and scalars.null_count > 0: + # GH52843: upstream bug for duration types when originally + # constructed with data containing numpy NaT. + # https://github.com/apache/arrow/issues/35088 + arr = arr.fillna(arr.dtype.na_value) + return arr @classmethod def _from_sequence_of_strings( diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 9191560bd9a68..0300b271acc3f 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2830,3 +2830,19 @@ def test_date32_repr(): arrow_dt = pa.array([date.fromisoformat("2020-01-01")], type=pa.date32()) ser = pd.Series(arrow_dt, dtype=ArrowDtype(arrow_dt.type)) assert repr(ser) == "0 2020-01-01\ndtype: date32[day][pyarrow]" + + +@pytest.mark.xfail( + pa_version_under8p0, + reason="Function 'add_checked' has no kernel matching input types", + raises=pa.ArrowNotImplementedError, +) +def test_duration_overflow_from_ndarray_containing_nat(): + # GH52843 + data_ts = pd.to_datetime([1, None]) + data_td = pd.to_timedelta([1, None]) + ser_ts = pd.Series(data_ts, dtype=ArrowDtype(pa.timestamp("ns"))) + ser_td = pd.Series(data_td, dtype=ArrowDtype(pa.duration("ns"))) 
+ result = ser_ts + ser_td + expected = pd.Series([2, None], dtype=ArrowDtype(pa.timestamp("ns"))) + tm.assert_series_equal(result, expected) From cc3b50073968eaf54772d7a1863e7aaf68ef1e17 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 23 Apr 2023 12:02:53 +0200 Subject: [PATCH 027/577] CI: Unpin matplotlib (#51990) --- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-38-downstream_compat.yaml | 2 +- ci/deps/actions-38.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- ci/deps/circle-38-arm64.yaml | 2 +- environment.yml | 2 +- pandas/plotting/_core.py | 2 +- pandas/plotting/_misc.py | 27 +++++++++-------------- pandas/tests/plotting/test_series.py | 4 ++-- requirements-dev.txt | 2 +- 11 files changed, 21 insertions(+), 28 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index f159b71a1b48c..83b476363e9e3 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -32,7 +32,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1, <3.7.0 + - matplotlib>=3.6.1 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 6da92a28965a2..70a674417e01e 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -32,7 +32,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1, <3.7.0 + - matplotlib>=3.6.1 # - numba>=0.55.2 not compatible with 3.11 - numexpr>=2.8.0 - odfpy>=1.4.1 diff --git a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-38-downstream_compat.yaml index dbfcc535fe3fb..670d7c37dc4d2 100644 --- a/ci/deps/actions-38-downstream_compat.yaml +++ b/ci/deps/actions-38-downstream_compat.yaml @@ -33,7 +33,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1, <3.7.0 + - matplotlib>=3.6.1 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 diff --git 
a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml index 86877c5f1c263..df7d0277f3ba9 100644 --- a/ci/deps/actions-38.yaml +++ b/ci/deps/actions-38.yaml @@ -32,7 +32,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1, <3.7.0 + - matplotlib>=3.6.1 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index bc89fa7bbb8b9..7b60eec7696cc 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -32,7 +32,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1, <3.7.0 + - matplotlib>=3.6.1 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-38-arm64.yaml index 85c4b82d55387..5102b2ca55404 100644 --- a/ci/deps/circle-38-arm64.yaml +++ b/ci/deps/circle-38-arm64.yaml @@ -32,7 +32,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1, <3.7.0 + - matplotlib>=3.6.1 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 diff --git a/environment.yml b/environment.yml index abbf83fea2508..bde8c46bffd97 100644 --- a/environment.yml +++ b/environment.yml @@ -35,7 +35,7 @@ dependencies: - ipython - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1, <3.7.0 + - matplotlib>=3.6.1 - numba>=0.55.2 - numexpr>=2.8.0 # pin for "Run checks on imported code" job - openpyxl<3.1.1, >=3.0.7 diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 75af0c7bdae79..6ef33c3d58306 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -393,7 +393,7 @@ def hist_frame( >>> boxplot = df.boxplot(column=['Col1', 'Col2'], return_type='axes') >>> type(boxplot) - + When grouping with ``by``, a Series mapping columns to ``return_type`` is returned: diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index cbfdbc10ad71c..f8f9a584f0563 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -143,22 +143,15 @@ def scatter_matrix( 
>>> df = pd.DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D']) >>> pd.plotting.scatter_matrix(df, alpha=0.2) - array([[, - , - , - ], - [, - , - , - ], - [, - , - , - ], - [, - , - , - ]], dtype=object) + array([[, , + , ], + [, , + , ], + [, , + , ], + [, , + , ]], + dtype=object) """ plot_backend = _get_plot_backend("matplotlib") return plot_backend.scatter_matrix( @@ -509,7 +502,7 @@ def lag_plot(series: Series, lag: int = 1, ax: Axes | None = None, **kwds) -> Ax :context: close-figs >>> pd.plotting.lag_plot(s, lag=1) - + """ plot_backend = _get_plot_backend("matplotlib") return plot_backend.lag_plot(series=series, lag=lag, ax=ax, **kwds) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index c294e9c23882d..755a1811c1356 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -456,8 +456,8 @@ def test_df_series_secondary_legend(self): ) def test_secondary_logy(self, input_logy, expected_scale): # GH 25545 - s1 = Series(np.random.randn(30)) - s2 = Series(np.random.randn(30)) + s1 = Series(np.random.randn(100)) + s2 = Series(np.random.randn(100)) # GH 24980 ax1 = s1.plot(logy=input_logy) diff --git a/requirements-dev.txt b/requirements-dev.txt index 8a79f911265a3..30189c35fcbb5 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -24,7 +24,7 @@ gcsfs>=2022.05.0 ipython jinja2>=3.1.2 lxml>=4.8.0 -matplotlib>=3.6.1, <3.7.0 +matplotlib>=3.6.1 numba>=0.55.2 numexpr>=2.8.0 openpyxl<3.1.1, >=3.0.7 From 5c7b8eaa191bd0e7d91fa012fce134ea07feea62 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 23 Apr 2023 12:10:53 +0200 Subject: [PATCH 028/577] Adjust unxfail condition for nat test for new numpy release (#52867) --- pandas/compat/numpy/__init__.py | 1 + pandas/tests/scalar/test_nat.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py 
index fbc04386bcef2..97c434d8f35d0 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -8,6 +8,7 @@ _nlv = Version(_np_version) np_version_under1p22 = _nlv < Version("1.22") np_version_gte1p24 = _nlv >= Version("1.24") +np_version_gte1p24p3 = _nlv >= Version("1.24.3") is_numpy_dev = _nlv.dev is not None _min_numpy_ver = "1.21.6" diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 0d85e37873a52..8296201345d2f 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -9,7 +9,7 @@ import pytz from pandas._libs.tslibs import iNaT -from pandas.compat import is_numpy_dev +from pandas.compat.numpy import np_version_gte1p24p3 from pandas import ( DatetimeIndex, @@ -526,7 +526,7 @@ def test_to_numpy_alias(): pytest.param( Timedelta(0).to_timedelta64(), marks=pytest.mark.xfail( - not is_numpy_dev, + not np_version_gte1p24p3, reason="td64 doesn't return NotImplemented, see numpy#17017", ), ), @@ -535,7 +535,7 @@ def test_to_numpy_alias(): pytest.param( Timestamp(0).to_datetime64(), marks=pytest.mark.xfail( - not is_numpy_dev, + not np_version_gte1p24p3, reason="dt64 doesn't return NotImplemented, see numpy#17017", ), ), From 089a94834a6130199ab13a028b822e9ef99a9880 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 23 Apr 2023 13:45:39 +0200 Subject: [PATCH 029/577] BUG: describe not respecting ArrowDtype in include/exclude (#52577) --- doc/source/whatsnew/v2.0.1.rst | 1 + pandas/core/dtypes/common.py | 4 ++++ pandas/core/frame.py | 2 ++ pandas/tests/frame/methods/test_describe.py | 20 ++++++++++++++++++++ 4 files changed, 27 insertions(+) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index cdc78dc60ea01..74a4974cef1da 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -38,6 +38,7 @@ Bug fixes - Bug in :func:`to_datetime` and :func:`to_timedelta` when trying to convert 
numeric data with a :class:`ArrowDtype` (:issue:`52425`) - Bug in :func:`to_numeric` with ``errors='coerce'`` and ``dtype_backend='pyarrow'`` with :class:`ArrowDtype` data (:issue:`52588`) - Bug in :meth:`ArrowDtype.__from_arrow__` not respecting if dtype is explicitly given (:issue:`52533`) +- Bug in :meth:`DataFrame.describe` not respecting ``ArrowDtype`` in ``include`` and ``exclude`` (:issue:`52570`) - Bug in :meth:`DataFrame.max` and related casting different :class:`Timestamp` resolutions always to nanoseconds (:issue:`52524`) - Bug in :meth:`Series.describe` not returning :class:`ArrowDtype` with ``pyarrow.float64`` type with numeric data (:issue:`52427`) - Bug in :meth:`Series.dt.tz_localize` incorrectly localizing timestamps with :class:`ArrowDtype` (:issue:`52677`) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index e8f0ed1b80150..f27941331e7f9 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1500,6 +1500,10 @@ def infer_dtype_from_object(dtype) -> type: except TypeError: # Should still pass if we don't have a date-like pass + if hasattr(dtype, "numpy_dtype"): + # TODO: Implement this properly + # https://github.com/pandas-dev/pandas/issues/52576 + return dtype.numpy_dtype.type return dtype.type try: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bd298b8d723b8..31af2a8ff1304 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -132,6 +132,7 @@ PeriodArray, TimedeltaArray, ) +from pandas.core.arrays.arrow import ArrowDtype from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.construction import ( ensure_wrapped_if_datetimelike, @@ -4718,6 +4719,7 @@ def check_int_infer_dtype(dtypes): def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool: # GH 46870: BooleanDtype._is_numeric == True but should be excluded + dtype = dtype if not isinstance(dtype, ArrowDtype) else dtype.numpy_dtype return issubclass(dtype.type, tuple(dtypes_set)) or ( np.number in 
dtypes_set and getattr(dtype, "_is_numeric", False) diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index e2b8a0f63c31a..fbe6ff356499f 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -395,3 +395,23 @@ def test_ea_with_na(self, any_numeric_ea_dtype): dtype="Float64", ) tm.assert_frame_equal(result, expected) + + def test_describe_exclude_pa_dtype(self): + # GH#52570 + pa = pytest.importorskip("pyarrow") + df = DataFrame( + { + "a": Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int8())), + "b": Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int16())), + "c": Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int32())), + } + ) + result = df.describe( + include=pd.ArrowDtype(pa.int8()), exclude=pd.ArrowDtype(pa.int32()) + ) + expected = DataFrame( + {"a": [3, 2, 1, 1, 1.5, 2, 2.5, 3]}, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + dtype=pd.ArrowDtype(pa.float64()), + ) + tm.assert_frame_equal(result, expected) From 332035b642121cf21cd210cfad564fd07a43812f Mon Sep 17 00:00:00 2001 From: Artem Vorobyev <44067777+SecretLake@users.noreply.github.com> Date: Sun, 23 Apr 2023 14:20:28 +0200 Subject: [PATCH 030/577] Test dir of pandas.api.* (#52826) --- pandas/tests/api/test_api.py | 96 ++++++++++++++++++++++++++++++++++-- 1 file changed, 93 insertions(+), 3 deletions(-) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index c2e89a2db12ee..73713de08473b 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -5,7 +5,13 @@ import pandas as pd from pandas import api import pandas._testing as tm -from pandas.api import typing as api_typing +from pandas.api import ( + extensions as api_extensions, + indexers as api_indexers, + interchange as api_interchange, + types as api_types, + typing as api_typing, +) class Base: @@ -237,7 +243,13 @@ def test_depr(self): class TestApi(Base): - allowed = ["types", 
"extensions", "indexers", "interchange", "typing"] + allowed_api_dirs = [ + "types", + "extensions", + "indexers", + "interchange", + "typing", + ] allowed_typing = [ "DataFrameGroupBy", "DatetimeIndexResamplerGroupby", @@ -256,13 +268,91 @@ class TestApi(Base): "TimeGrouper", "Window", ] + allowed_api_types = [ + "is_any_real_numeric_dtype", + "is_array_like", + "is_bool", + "is_bool_dtype", + "is_categorical_dtype", + "is_complex", + "is_complex_dtype", + "is_datetime64_any_dtype", + "is_datetime64_dtype", + "is_datetime64_ns_dtype", + "is_datetime64tz_dtype", + "is_dict_like", + "is_dtype_equal", + "is_extension_array_dtype", + "is_file_like", + "is_float", + "is_float_dtype", + "is_hashable", + "is_int64_dtype", + "is_integer", + "is_integer_dtype", + "is_interval", + "is_interval_dtype", + "is_iterator", + "is_list_like", + "is_named_tuple", + "is_number", + "is_numeric_dtype", + "is_object_dtype", + "is_period_dtype", + "is_re", + "is_re_compilable", + "is_scalar", + "is_signed_integer_dtype", + "is_sparse", + "is_string_dtype", + "is_timedelta64_dtype", + "is_timedelta64_ns_dtype", + "is_unsigned_integer_dtype", + "pandas_dtype", + "infer_dtype", + "union_categoricals", + "CategoricalDtype", + "DatetimeTZDtype", + "IntervalDtype", + "PeriodDtype", + ] + allowed_api_interchange = ["from_dataframe", "DataFrame"] + allowed_api_indexers = [ + "check_array_indexer", + "BaseIndexer", + "FixedForwardWindowIndexer", + "VariableOffsetWindowIndexer", + ] + allowed_api_extensions = [ + "no_default", + "ExtensionDtype", + "register_extension_dtype", + "register_dataframe_accessor", + "register_index_accessor", + "register_series_accessor", + "take", + "ExtensionArray", + "ExtensionScalarOpsMixin", + ] def test_api(self): - self.check(api, self.allowed) + self.check(api, self.allowed_api_dirs) def test_api_typing(self): self.check(api_typing, self.allowed_typing) + def test_api_types(self): + self.check(api_types, self.allowed_api_types) + + def 
test_api_interchange(self): + self.check(api_interchange, self.allowed_api_interchange) + + def test_api_indexers(self): + self.check(api_indexers, self.allowed_api_indexers) + + def test_api_extensions(self): + self.check(api_extensions, self.allowed_api_extensions) + class TestTesting(Base): funcs = [ From 17bddb2d25517ac368f2fbea40e4d9839be6a047 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sun, 23 Apr 2023 14:51:39 +0100 Subject: [PATCH 031/577] BUG: Adding a columns to a Frame with RangeIndex columns using a non-scalar key (#52877) --- doc/source/whatsnew/v2.0.1.rst | 2 ++ pandas/core/indexes/range.py | 2 ++ pandas/tests/indexes/test_indexing.py | 33 ++++++++++++++++++++++++--- 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 74a4974cef1da..b44cd5f9a1369 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -20,6 +20,7 @@ Fixed regressions - Fixed regression in :meth:`MultiIndex.isin` raising ``TypeError`` for ``Generator`` (:issue:`52568`) - Fixed regression in :meth:`Series.describe` showing ``RuntimeWarning`` for extension dtype :class:`Series` with one element (:issue:`52515`) - Fixed regression in :meth:`SeriesGroupBy.agg` failing when grouping with categorical data, multiple groupings, ``as_index=False``, and a list of aggregations (:issue:`52760`) +- Fixed regression when adding a new column to a :class:`DataFrame` when the :attr:`DataFrame.columns` was a :class:`RangeIndex` and the new key was hashable but not a scalar (:issue:`52652`) .. --------------------------------------------------------------------------- .. 
_whatsnew_201.bug_fixes: @@ -57,6 +58,7 @@ Other - Bug in :class:`Index` where creating or converting to numpy string dtypes would raise ``NotImplementedError`` (:issue:`50127`) - Implemented :meth:`Series.str.split` and :meth:`Series.str.rsplit` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`52401`) - Implemented most ``str`` accessor methods for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`52401`) +- Supplying a non-integer hashable key that tests ``False`` in :func:`api.types.is_scalar` now raises a ``KeyError`` for :meth:`RangeIndex.get_loc`, like it does for :meth:`Index.get_loc`. Previously it raised an ``InvalidIndexError`` (:issue:`52652`). .. --------------------------------------------------------------------------- .. _whatsnew_201.contributors: diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 10b3fa34da127..dd72ce30d290b 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -347,6 +347,8 @@ def get_loc(self, key): return self._range.index(new_key) except ValueError as err: raise KeyError(key) from err + if isinstance(key, Hashable): + raise KeyError(key) self._check_indexing_error(key) raise KeyError(key) diff --git a/pandas/tests/indexes/test_indexing.py b/pandas/tests/indexes/test_indexing.py index 52f09ac25873e..3bc55786e1d2f 100644 --- a/pandas/tests/indexes/test_indexing.py +++ b/pandas/tests/indexes/test_indexing.py @@ -19,7 +19,10 @@ from pandas.errors import InvalidIndexError -from pandas.core.dtypes.common import is_float_dtype +from pandas.core.dtypes.common import ( + is_float_dtype, + is_scalar, +) from pandas import ( NA, @@ -29,7 +32,6 @@ MultiIndex, NaT, PeriodIndex, - RangeIndex, TimedeltaIndex, ) import pandas._testing as tm @@ -179,6 +181,32 @@ def test_get_loc_non_hashable(self, index): with pytest.raises((TypeError, InvalidIndexError), match="slice"): index.get_loc(slice(0, 1)) + def test_get_loc_non_scalar_hashable(self, index): + # GH52877 + from enum import 
Enum + + class E(Enum): + X1 = "x1" + + assert not is_scalar(E.X1) + + exc = KeyError + msg = "" + if isinstance( + index, + ( + DatetimeIndex, + TimedeltaIndex, + PeriodIndex, + IntervalIndex, + ), + ): + # TODO: make these more consistent? + exc = InvalidIndexError + msg = "E.X1" + with pytest.raises(exc, match=msg): + index.get_loc(E.X1) + def test_get_loc_generator(self, index): exc = KeyError if isinstance( @@ -187,7 +215,6 @@ def test_get_loc_generator(self, index): DatetimeIndex, TimedeltaIndex, PeriodIndex, - RangeIndex, IntervalIndex, MultiIndex, ), From 16d9c39ca224d963523c8889d4a406386169d2bd Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 23 Apr 2023 15:52:15 +0200 Subject: [PATCH 032/577] Add ruff cache to gitignore (#52873) --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 88ed58b70925d..8b2d79b1f95f5 100644 --- a/.gitignore +++ b/.gitignore @@ -70,6 +70,7 @@ coverage.xml coverage_html_report .mypy_cache *.pytest_cache +.ruff_cache # hypothesis test database .hypothesis/ __pycache__ From 658ea54d8415eaf340b0e14183d70fc956e3aa7a Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sun, 23 Apr 2023 15:11:12 -0400 Subject: [PATCH 033/577] TST/CLN: Remove groupby.test_allowlist (#52880) --- pandas/tests/groupby/test_allowlist.py | 124 ------------------------- pandas/tests/groupby/test_function.py | 50 ++++++++++ pandas/tests/groupby/test_groupby.py | 62 +++++++++++++ 3 files changed, 112 insertions(+), 124 deletions(-) delete mode 100644 pandas/tests/groupby/test_allowlist.py diff --git a/pandas/tests/groupby/test_allowlist.py b/pandas/tests/groupby/test_allowlist.py deleted file mode 100644 index d495441593aed..0000000000000 --- a/pandas/tests/groupby/test_allowlist.py +++ /dev/null @@ -1,124 +0,0 @@ -""" -TODO: Existing tests should be moved or deduplicated -Do not add tests here! 
-""" - -import pytest - -from pandas import ( - DataFrame, - date_range, -) -import pandas._testing as tm - - -@pytest.mark.parametrize( - "op", - [ - "sum", - "prod", - "min", - "max", - "median", - "mean", - "skew", - "std", - "var", - "sem", - ], -) -@pytest.mark.parametrize("axis", [0, 1]) -@pytest.mark.parametrize("skipna", [True, False]) -@pytest.mark.parametrize("sort", [True, False]) -def test_regression_allowlist_methods(op, axis, skipna, sort): - # GH6944 - # GH 17537 - # explicitly test the allowlist methods - raw_frame = DataFrame([0]) - if axis == 0: - frame = raw_frame - msg = "The 'axis' keyword in DataFrame.groupby is deprecated and will be" - else: - frame = raw_frame.T - msg = "DataFrame.groupby with axis=1 is deprecated" - - with tm.assert_produces_warning(FutureWarning, match=msg): - grouped = frame.groupby(level=0, axis=axis, sort=sort) - - if op == "skew": - # skew has skipna - result = getattr(grouped, op)(skipna=skipna) - expected = frame.groupby(level=0).apply( - lambda h: getattr(h, op)(axis=axis, skipna=skipna) - ) - if sort: - expected = expected.sort_index(axis=axis) - tm.assert_frame_equal(result, expected) - else: - result = getattr(grouped, op)() - expected = frame.groupby(level=0).apply(lambda h: getattr(h, op)(axis=axis)) - if sort: - expected = expected.sort_index(axis=axis) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "method", - [ - "count", - "corr", - "cummax", - "cummin", - "cumprod", - "describe", - "rank", - "quantile", - "diff", - "shift", - "all", - "any", - "idxmin", - "idxmax", - "ffill", - "bfill", - "pct_change", - ], -) -def test_groupby_selection_with_methods(df, method): - # some methods which require DatetimeIndex - rng = date_range("2014", periods=len(df)) - df.index = rng - - g = df.groupby(["A"])[["C"]] - g_exp = df[["C"]].groupby(df["A"]) - # TODO check groupby with > 1 col ? - - res = getattr(g, method)() - exp = getattr(g_exp, method)() - - # should always be frames! 
- tm.assert_frame_equal(res, exp) - - -def test_groupby_selection_other_methods(df): - # some methods which require DatetimeIndex - rng = date_range("2014", periods=len(df)) - df.columns.name = "foo" - df.index = rng - - g = df.groupby(["A"])[["C"]] - g_exp = df[["C"]].groupby(df["A"]) - - # methods which aren't just .foo() - tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0)) - msg = "DataFrameGroupBy.dtypes is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - tm.assert_frame_equal(g.dtypes, g_exp.dtypes) - tm.assert_frame_equal(g.apply(lambda x: x.sum()), g_exp.apply(lambda x: x.sum())) - - tm.assert_frame_equal(g.resample("D").mean(), g_exp.resample("D").mean()) - tm.assert_frame_equal(g.resample("D").ohlc(), g_exp.resample("D").ohlc()) - - tm.assert_frame_equal( - g.filter(lambda x: len(x) == 3), g_exp.filter(lambda x: len(x) == 3) - ) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index ac192f190962d..159c620e36cdd 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1662,3 +1662,53 @@ def test_duplicate_columns(request, groupby_func, as_index): if groupby_func not in ("size", "ngroup", "cumcount"): expected = expected.rename(columns={"c": "b"}) tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "op", + [ + "sum", + "prod", + "min", + "max", + "median", + "mean", + "skew", + "std", + "var", + "sem", + ], +) +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +def test_regression_allowlist_methods(op, axis, skipna, sort): + # GH6944 + # GH 17537 + # explicitly test the allowlist methods + raw_frame = DataFrame([0]) + if axis == 0: + frame = raw_frame + msg = "The 'axis' keyword in DataFrame.groupby is deprecated and will be" + else: + frame = raw_frame.T + msg = "DataFrame.groupby with axis=1 is deprecated" + + with 
tm.assert_produces_warning(FutureWarning, match=msg): + grouped = frame.groupby(level=0, axis=axis, sort=sort) + + if op == "skew": + # skew has skipna + result = getattr(grouped, op)(skipna=skipna) + expected = frame.groupby(level=0).apply( + lambda h: getattr(h, op)(axis=axis, skipna=skipna) + ) + if sort: + expected = expected.sort_index(axis=axis) + tm.assert_frame_equal(result, expected) + else: + result = getattr(grouped, op)() + expected = frame.groupby(level=0).apply(lambda h: getattr(h, op)(axis=axis)) + if sort: + expected = expected.sort_index(axis=axis) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 29382be60b08b..53148eb37e15a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2981,3 +2981,65 @@ def test_groupby_sum_on_nan_should_return_nan(bug_var): expected_df = DataFrame([bug_var, bug_var, bug_var, np.nan], columns=["A"]) tm.assert_frame_equal(result, expected_df) + + +@pytest.mark.parametrize( + "method", + [ + "count", + "corr", + "cummax", + "cummin", + "cumprod", + "describe", + "rank", + "quantile", + "diff", + "shift", + "all", + "any", + "idxmin", + "idxmax", + "ffill", + "bfill", + "pct_change", + ], +) +def test_groupby_selection_with_methods(df, method): + # some methods which require DatetimeIndex + rng = date_range("2014", periods=len(df)) + df.index = rng + + g = df.groupby(["A"])[["C"]] + g_exp = df[["C"]].groupby(df["A"]) + # TODO check groupby with > 1 col ? + + res = getattr(g, method)() + exp = getattr(g_exp, method)() + + # should always be frames! 
+ tm.assert_frame_equal(res, exp) + + +def test_groupby_selection_other_methods(df): + # some methods which require DatetimeIndex + rng = date_range("2014", periods=len(df)) + df.columns.name = "foo" + df.index = rng + + g = df.groupby(["A"])[["C"]] + g_exp = df[["C"]].groupby(df["A"]) + + # methods which aren't just .foo() + tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0)) + msg = "DataFrameGroupBy.dtypes is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + tm.assert_frame_equal(g.dtypes, g_exp.dtypes) + tm.assert_frame_equal(g.apply(lambda x: x.sum()), g_exp.apply(lambda x: x.sum())) + + tm.assert_frame_equal(g.resample("D").mean(), g_exp.resample("D").mean()) + tm.assert_frame_equal(g.resample("D").ohlc(), g_exp.resample("D").ohlc()) + + tm.assert_frame_equal( + g.filter(lambda x: len(x) == 3), g_exp.filter(lambda x: len(x) == 3) + ) From b779f432996e75c6634a5feb65e356dd1ae90d23 Mon Sep 17 00:00:00 2001 From: AG <98327736+ggold7046@users.noreply.github.com> Date: Mon, 24 Apr 2023 00:43:43 +0530 Subject: [PATCH 034/577] DOC add example of DataFrame.index (#52835) --- ci/code_checks.sh | 1 - pandas/core/frame.py | 45 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index c046d55d80b49..55618590071b5 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -532,7 +532,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.api.extensions.ExtensionArray.ndim \ pandas.api.extensions.ExtensionArray.shape \ pandas.api.extensions.ExtensionArray.tolist \ - pandas.DataFrame.index \ pandas.DataFrame.columns \ pandas.DataFrame.__iter__ \ pandas.DataFrame.keys \ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 31af2a8ff1304..051ebfff47f83 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11770,7 +11770,50 @@ def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame: _info_axis_name: 
Literal["columns"] = "columns" index = properties.AxisProperty( - axis=1, doc="The index (row labels) of the DataFrame." + axis=1, + doc=""" + The index (row labels) of the DataFrame. + + The index of a DataFrame is a series of labels that identify each row. + The labels can be integers, strings, or any other hashable type. The index + is used for label-based access and alignment, and can be accessed or + modified using this attribute. + + Returns + ------- + pandas.Index + The index labels of the DataFrame. + + See Also + -------- + DataFrame.columns : The column labels of the DataFrame. + DataFrame.to_numpy : Convert the DataFrame to a NumPy array. + + Examples + -------- + >>> df = pd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'], + ... 'Age': [25, 30, 35], + ... 'Location': ['Seattle', 'New York', 'Kona']}, + ... index=([10, 20, 30])) + >>> df.index + Index([10, 20, 30], dtype='int64') + + In this example, we create a DataFrame with 3 rows and 3 columns, + including Name, Age, and Location information. We set the index labels to + be the integers 10, 20, and 30. We then access the `index` attribute of the + DataFrame, which returns an `Index` object containing the index labels. + + >>> df.index = [100, 200, 300] + >>> df + Name Age Location + 100 Alice 25 Seattle + 200 Bob 30 New York + 300 Aritra 35 Kona + + In this example, we modify the index labels of the DataFrame by assigning + a new list of labels to the `index` attribute. The DataFrame is then + updated with the new labels, and the output shows the modified DataFrame. 
+ """, ) columns = properties.AxisProperty(axis=0, doc="The column labels of the DataFrame.") From 4c3cffd6cdb1ef0bc626e18ebaa791385733d07e Mon Sep 17 00:00:00 2001 From: Nirav <61644078+srkds@users.noreply.github.com> Date: Mon, 24 Apr 2023 00:48:22 +0530 Subject: [PATCH 035/577] TST: cast string to complex (#52855) --- pandas/tests/dtypes/test_dtypes.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 5b4e2ea7d9035..837977698be83 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1162,6 +1162,13 @@ def test_compare_complex_dtypes(): df.lt(df.astype(object)) +def test_cast_string_to_complex(): + # GH 4895 + expected = pd.DataFrame(["1.0+5j", "1.5-3j"], dtype=complex) + result = pd.DataFrame(["1.0+5j", "1.5-3j"]).astype(complex) + tm.assert_frame_equal(result, expected) + + def test_multi_column_dtype_assignment(): # GH #27583 df = pd.DataFrame({"a": [0.0], "b": 0.0}) From a26c2f1404eb31852fdbfcbb3667eb15eb29a0e6 Mon Sep 17 00:00:00 2001 From: Fabrizio Primerano Date: Sun, 23 Apr 2023 21:19:26 +0200 Subject: [PATCH 036/577] TST: Grouping with categorical interval columns (#52818) --- pandas/tests/groupby/test_groupby.py | 42 ++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 53148eb37e15a..cd33f031720e1 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -16,6 +16,7 @@ DataFrame, Grouper, Index, + Interval, MultiIndex, RangeIndex, Series, @@ -2972,6 +2973,47 @@ def test_groupby_numeric_only_std_no_result(numeric_only): dfgb.std(numeric_only=numeric_only) +def test_grouping_with_categorical_interval_columns(): + # GH#34164 + df = DataFrame({"x": [0.1, 0.2, 0.3, -0.4, 0.5], "w": ["a", "b", "a", "c", "a"]}) + qq = pd.qcut(df["x"], q=np.linspace(0, 1, 5)) + result = df.groupby([qq, "w"], 
observed=False)["x"].agg("mean") + categorical_index_level_1 = Categorical( + [ + Interval(-0.401, 0.1, closed="right"), + Interval(0.1, 0.2, closed="right"), + Interval(0.2, 0.3, closed="right"), + Interval(0.3, 0.5, closed="right"), + ], + ordered=True, + ) + index_level_2 = ["a", "b", "c"] + mi = MultiIndex.from_product( + [categorical_index_level_1, index_level_2], names=["x", "w"] + ) + expected = Series( + np.array( + [ + 0.1, + np.nan, + -0.4, + np.nan, + 0.2, + np.nan, + 0.3, + np.nan, + np.nan, + 0.5, + np.nan, + np.nan, + ] + ), + index=mi, + name="x", + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("bug_var", [1, "a"]) def test_groupby_sum_on_nan_should_return_nan(bug_var): # GH 24196 From fe676653d8a26cac4df5ed4bb47d47393d7d808c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sun, 23 Apr 2023 12:30:20 -0700 Subject: [PATCH 037/577] CI: Combine unit test workflows (#52694) --- .github/workflows/32-bit-linux.yml | 58 ----- .github/workflows/macos-windows.yml | 60 ----- .github/workflows/python-dev.yml | 95 -------- .github/workflows/ubuntu.yml | 168 -------------- .github/workflows/unit-tests.yml | 315 ++++++++++++++++++++++++++ pandas/tests/tools/test_to_numeric.py | 2 + 6 files changed, 317 insertions(+), 381 deletions(-) delete mode 100644 .github/workflows/32-bit-linux.yml delete mode 100644 .github/workflows/macos-windows.yml delete mode 100644 .github/workflows/python-dev.yml delete mode 100644 .github/workflows/ubuntu.yml create mode 100644 .github/workflows/unit-tests.yml diff --git a/.github/workflows/32-bit-linux.yml b/.github/workflows/32-bit-linux.yml deleted file mode 100644 index 95d0d78c7585b..0000000000000 --- a/.github/workflows/32-bit-linux.yml +++ /dev/null @@ -1,58 +0,0 @@ -name: 32 Bit Linux - -on: - push: - branches: - - main - - 2.0.x - pull_request: - branches: - - main - - 2.0.x - paths-ignore: - - "doc/**" - -permissions: - contents: read - -jobs: - 
pytest: - runs-on: ubuntu-22.04 - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Run 32-bit manylinux2014 Docker Build / Tests - run: | - # Without this (line 34), versioneer will not be able to determine the pandas version. - # This is because of a security update to git that blocks it from reading the config folder if - # it is not owned by the current user. We hit this since the "mounted" folder is not hit by the - # Docker container. - # xref https://github.com/pypa/manylinux/issues/1309 - docker pull quay.io/pypa/manylinux2014_i686 - docker run --platform linux/386 -v $(pwd):/pandas quay.io/pypa/manylinux2014_i686 \ - /bin/bash -xc "cd pandas && \ - git config --global --add safe.directory /pandas && \ - /opt/python/cp38-cp38/bin/python -m venv ~/virtualenvs/pandas-dev && \ - . ~/virtualenvs/pandas-dev/bin/activate && \ - python -m pip install --no-deps -U pip wheel 'setuptools<60.0.0' && \ - python -m pip install versioneer[toml] && \ - python -m pip install cython numpy python-dateutil pytz pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 && \ - python setup.py build_ext -q -j$(nproc) && \ - python -m pip install --no-build-isolation --no-use-pep517 -e . 
&& \ - python -m pip list && \ - export PANDAS_CI=1 && \ - pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml" - - - name: Publish test results for Python 3.8-32 bit full Linux - uses: actions/upload-artifact@v3 - with: - name: Test results - path: test-data.xml - if: failure() - concurrency: - # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-32bit - cancel-in-progress: true diff --git a/.github/workflows/macos-windows.yml b/.github/workflows/macos-windows.yml deleted file mode 100644 index 7ed5f5b90b959..0000000000000 --- a/.github/workflows/macos-windows.yml +++ /dev/null @@ -1,60 +0,0 @@ -name: Windows-macOS - -on: - push: - branches: - - main - - 2.0.x - pull_request: - branches: - - main - - 2.0.x - paths-ignore: - - "doc/**" - - "web/**" - -env: - PANDAS_CI: 1 - PYTEST_TARGET: pandas - PATTERN: "not slow and not db and not network and not single_cpu" - -permissions: - contents: read - -jobs: - pytest: - defaults: - run: - shell: bash -el {0} - timeout-minutes: 180 - strategy: - matrix: - os: [macos-latest, windows-latest] - env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml] - fail-fast: false - runs-on: ${{ matrix.os }} - name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }} - concurrency: - # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.os }} - cancel-in-progress: true - env: - # GH 47443: PYTEST_WORKERS > 1 crashes Windows builds with memory related errors - PYTEST_WORKERS: ${{ matrix.os == 'macos-latest' && 'auto' || '1' }} - - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Set up Conda - uses: ./.github/actions/setup-conda - with: - environment-file: ci/deps/${{ matrix.env_file }} - - - name: Build 
Pandas - uses: ./.github/actions/build_pandas - - - name: Test - uses: ./.github/actions/run-tests diff --git a/.github/workflows/python-dev.yml b/.github/workflows/python-dev.yml deleted file mode 100644 index 8ac8a1a1fcebf..0000000000000 --- a/.github/workflows/python-dev.yml +++ /dev/null @@ -1,95 +0,0 @@ -# This workflow may or may not run depending on the state of the next -# unreleased Python version. DO NOT DELETE IT. -# -# In general, this file will remain frozen(present, but not running) until: -# - The next unreleased Python version has released beta 1 -# - This version should be available on GitHub Actions. -# - Our required build/runtime dependencies(numpy, pytz, Cython, python-dateutil) -# support that unreleased Python version. -# To unfreeze, comment out the ``if: false`` condition, and make sure you update -# the name of the workflow and Python version in actions/setup-python to: '3.12-dev' -# -# After it has been unfrozen, this file should remain unfrozen(present, and running) until: -# - The next Python version has been officially released. -# OR -# - Most/All of our optional dependencies support Python 3.11 AND -# - The next Python version has released a rc(we are guaranteed a stable ABI). -# To freeze this file, uncomment out the ``if: false`` condition, and migrate the jobs -# to the corresponding posix/windows-macos/sdist etc. workflows. -# Feel free to modify this comment as necessary. 
- -name: Python Dev - -on: - push: - branches: - - main - - 2.0.x - - None - pull_request: - branches: - - main - - 2.0.x - - None - paths-ignore: - - "doc/**" - - "web/**" - -env: - PYTEST_WORKERS: "auto" - PANDAS_CI: 1 - PATTERN: "not slow and not network and not clipboard and not single_cpu" - COVERAGE: true - PYTEST_TARGET: pandas - -permissions: - contents: read - -jobs: - build: - if: false # Uncomment this to freeze the workflow, comment it to unfreeze - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-22.04, macOS-latest, windows-latest] - - name: actions-311-dev - timeout-minutes: 120 - - concurrency: - #https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-${{ matrix.pytest_target }}-dev - cancel-in-progress: true - - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Set up Python Dev Version - uses: actions/setup-python@v4 - with: - python-version: '3.11-dev' - - - name: Install dependencies - run: | - python --version - python -m pip install --upgrade pip setuptools wheel - python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple numpy - python -m pip install git+https://github.com/nedbat/coveragepy.git - python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz cython hypothesis>=6.46.1 pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 - python -m pip list - - - name: Build Pandas - run: | - python setup.py build_ext -q -j4 - python -m pip install -e . 
--no-build-isolation --no-use-pep517 --no-index - - - name: Build Version - run: | - python -c "import pandas; pandas.show_versions();" - - - name: Test - uses: ./.github/actions/run-tests diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml deleted file mode 100644 index 7390c349ff565..0000000000000 --- a/.github/workflows/ubuntu.yml +++ /dev/null @@ -1,168 +0,0 @@ -name: Ubuntu - -on: - push: - branches: - - main - - 2.0.x - pull_request: - branches: - - main - - 2.0.x - paths-ignore: - - "doc/**" - - "web/**" - -permissions: - contents: read - -jobs: - pytest: - runs-on: ubuntu-22.04 - defaults: - run: - shell: bash -el {0} - timeout-minutes: 180 - strategy: - matrix: - env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml] - # Prevent the include jobs from overriding other jobs - pattern: [""] - include: - - name: "Downstream Compat" - env_file: actions-38-downstream_compat.yaml - pattern: "not slow and not network and not single_cpu" - pytest_target: "pandas/tests/test_downstream.py" - - name: "Minimum Versions" - env_file: actions-38-minimum_versions.yaml - pattern: "not slow and not network and not single_cpu" - - name: "Locale: it_IT" - env_file: actions-38.yaml - pattern: "not slow and not network and not single_cpu" - extra_apt: "language-pack-it" - # Use the utf8 version as the default, it has no bad side-effect. - lang: "it_IT.utf8" - lc_all: "it_IT.utf8" - # Also install it_IT (its encoding is ISO8859-1) but do not activate it. - # It will be temporarily activated during tests with locale.setlocale - extra_loc: "it_IT" - - name: "Locale: zh_CN" - env_file: actions-38.yaml - pattern: "not slow and not network and not single_cpu" - extra_apt: "language-pack-zh-hans" - # Use the utf8 version as the default, it has no bad side-effect. - lang: "zh_CN.utf8" - lc_all: "zh_CN.utf8" - # Also install zh_CN (its encoding is gb2312) but do not activate it. 
- # It will be temporarily activated during tests with locale.setlocale - extra_loc: "zh_CN" - - name: "Copy-on-Write" - env_file: actions-310.yaml - pattern: "not slow and not network and not single_cpu" - pandas_copy_on_write: "1" - - name: "Pypy" - env_file: actions-pypy-38.yaml - pattern: "not slow and not network and not single_cpu" - test_args: "--max-worker-restart 0" - - name: "Numpy Dev" - env_file: actions-310-numpydev.yaml - pattern: "not slow and not network and not single_cpu" - test_args: "-W error::DeprecationWarning -W error::FutureWarning" - # TODO(cython3): Re-enable once next-beta(after beta 1) comes out - # There are some warnings failing the build with -werror - pandas_ci: "0" - - name: "Pyarrow Nightly" - env_file: actions-311-pyarrownightly.yaml - pattern: "not slow and not network and not single_cpu" - fail-fast: false - name: ${{ matrix.name || matrix.env_file }} - env: - ENV_FILE: ci/deps/${{ matrix.env_file }} - PATTERN: ${{ matrix.pattern }} - EXTRA_APT: ${{ matrix.extra_apt || '' }} - LANG: ${{ matrix.lang || '' }} - LC_ALL: ${{ matrix.lc_all || '' }} - PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }} - PANDAS_CI: ${{ matrix.pandas_ci || '1' }} - TEST_ARGS: ${{ matrix.test_args || '' }} - PYTEST_WORKERS: 'auto' - PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} - concurrency: - # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }} - cancel-in-progress: true - - services: - mysql: - image: mysql - env: - MYSQL_ALLOW_EMPTY_PASSWORD: yes - MYSQL_DATABASE: pandas - options: >- - --health-cmd "mysqladmin ping" - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - - 3306:3306 - - postgres: - image: postgres - env: - POSTGRES_USER: postgres - POSTGRES_PASSWORD: postgres - POSTGRES_DB: pandas - options: >- - --health-cmd pg_isready - 
--health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - - 5432:5432 - - moto: - image: motoserver/moto:4.1.4 - env: - AWS_ACCESS_KEY_ID: foobar_key - AWS_SECRET_ACCESS_KEY: foobar_secret - ports: - - 5000:5000 - - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Extra installs - # xsel for clipboard tests - run: sudo apt-get update && sudo apt-get install -y xsel ${{ env.EXTRA_APT }} - - - name: Generate extra locales - # These extra locales will be available for locale.setlocale() calls in tests - run: | - sudo locale-gen ${{ matrix.extra_loc }} - if: ${{ matrix.extra_loc }} - - - name: Set up Conda - uses: ./.github/actions/setup-conda - with: - environment-file: ${{ env.ENV_FILE }} - - - name: Build Pandas - id: build - uses: ./.github/actions/build_pandas - - - name: Test (not single_cpu) - uses: ./.github/actions/run-tests - if: ${{ matrix.name != 'Pypy' }} - env: - # Set pattern to not single_cpu if not already set - PATTERN: ${{ env.PATTERN == '' && 'not single_cpu' || matrix.pattern }} - - - name: Test (single_cpu) - uses: ./.github/actions/run-tests - env: - PATTERN: 'single_cpu' - PYTEST_WORKERS: 1 - if: ${{ matrix.pattern == '' && (always() && steps.build.outcome == 'success')}} diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml new file mode 100644 index 0000000000000..31e2095624347 --- /dev/null +++ b/.github/workflows/unit-tests.yml @@ -0,0 +1,315 @@ +name: Unit Tests + +on: + push: + branches: + - main + - 2.0.x + pull_request: + branches: + - main + - 2.0.x + paths-ignore: + - "doc/**" + - "web/**" + +permissions: + contents: read + +defaults: + run: + shell: bash -el {0} + +jobs: + ubuntu: + runs-on: ubuntu-22.04 + timeout-minutes: 180 + strategy: + matrix: + env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml] + # Prevent the include jobs from overriding other jobs + pattern: [""] + include: + - name: "Downstream Compat" + 
env_file: actions-38-downstream_compat.yaml + pattern: "not slow and not network and not single_cpu" + pytest_target: "pandas/tests/test_downstream.py" + - name: "Minimum Versions" + env_file: actions-38-minimum_versions.yaml + pattern: "not slow and not network and not single_cpu" + - name: "Locale: it_IT" + env_file: actions-38.yaml + pattern: "not slow and not network and not single_cpu" + extra_apt: "language-pack-it" + # Use the utf8 version as the default, it has no bad side-effect. + lang: "it_IT.utf8" + lc_all: "it_IT.utf8" + # Also install it_IT (its encoding is ISO8859-1) but do not activate it. + # It will be temporarily activated during tests with locale.setlocale + extra_loc: "it_IT" + - name: "Locale: zh_CN" + env_file: actions-38.yaml + pattern: "not slow and not network and not single_cpu" + extra_apt: "language-pack-zh-hans" + # Use the utf8 version as the default, it has no bad side-effect. + lang: "zh_CN.utf8" + lc_all: "zh_CN.utf8" + # Also install zh_CN (its encoding is gb2312) but do not activate it. 
+ # It will be temporarily activated during tests with locale.setlocale + extra_loc: "zh_CN" + - name: "Copy-on-Write" + env_file: actions-310.yaml + pattern: "not slow and not network and not single_cpu" + pandas_copy_on_write: "1" + - name: "Pypy" + env_file: actions-pypy-38.yaml + pattern: "not slow and not network and not single_cpu" + test_args: "--max-worker-restart 0" + - name: "Numpy Dev" + env_file: actions-310-numpydev.yaml + pattern: "not slow and not network and not single_cpu" + test_args: "-W error::DeprecationWarning -W error::FutureWarning" + # TODO(cython3): Re-enable once next-beta(after beta 1) comes out + # There are some warnings failing the build with -werror + pandas_ci: "0" + - name: "Pyarrow Nightly" + env_file: actions-311-pyarrownightly.yaml + pattern: "not slow and not network and not single_cpu" + fail-fast: false + name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }} + env: + ENV_FILE: ci/deps/${{ matrix.env_file }} + PATTERN: ${{ matrix.pattern }} + EXTRA_APT: ${{ matrix.extra_apt || '' }} + LANG: ${{ matrix.lang || '' }} + LC_ALL: ${{ matrix.lc_all || '' }} + PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }} + PANDAS_CI: ${{ matrix.pandas_ci || '1' }} + TEST_ARGS: ${{ matrix.test_args || '' }} + PYTEST_WORKERS: 'auto' + PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }} + cancel-in-progress: true + + services: + mysql: + image: mysql + env: + MYSQL_ALLOW_EMPTY_PASSWORD: yes + MYSQL_DATABASE: pandas + options: >- + --health-cmd "mysqladmin ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 3306:3306 + + postgres: + image: postgres + env: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: pandas + options: >- + 
--health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 5432:5432 + + moto: + image: motoserver/moto:4.1.4 + env: + AWS_ACCESS_KEY_ID: foobar_key + AWS_SECRET_ACCESS_KEY: foobar_secret + ports: + - 5000:5000 + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Extra installs + # xsel for clipboard tests + run: sudo apt-get update && sudo apt-get install -y xsel ${{ env.EXTRA_APT }} + + - name: Generate extra locales + # These extra locales will be available for locale.setlocale() calls in tests + run: | + sudo locale-gen ${{ matrix.extra_loc }} + if: ${{ matrix.extra_loc }} + + - name: Set up Conda + uses: ./.github/actions/setup-conda + with: + environment-file: ${{ env.ENV_FILE }} + + - name: Build Pandas + id: build + uses: ./.github/actions/build_pandas + + - name: Test (not single_cpu) + uses: ./.github/actions/run-tests + if: ${{ matrix.name != 'Pypy' }} + env: + # Set pattern to not single_cpu if not already set + PATTERN: ${{ env.PATTERN == '' && 'not single_cpu' || matrix.pattern }} + + - name: Test (single_cpu) + uses: ./.github/actions/run-tests + env: + PATTERN: 'single_cpu' + PYTEST_WORKERS: 1 + if: ${{ matrix.pattern == '' && (always() && steps.build.outcome == 'success')}} + + macos-windows: + timeout-minutes: 180 + strategy: + matrix: + os: [macos-latest, windows-latest] + env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml] + fail-fast: false + runs-on: ${{ matrix.os }} + name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }} + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.os }} + cancel-in-progress: true + env: + PANDAS_CI: 1 + PYTEST_TARGET: pandas + PATTERN: "not slow and not db and not network and not single_cpu" + # GH 47443: PYTEST_WORKERS > 1 crashes Windows builds with 
memory related errors + PYTEST_WORKERS: ${{ matrix.os == 'macos-latest' && 'auto' || '1' }} + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Conda + uses: ./.github/actions/setup-conda + with: + environment-file: ci/deps/${{ matrix.env_file }} + + - name: Build Pandas + uses: ./.github/actions/build_pandas + + - name: Test + uses: ./.github/actions/run-tests + + Linux-32-bit: + runs-on: ubuntu-22.04 + container: + image: quay.io/pypa/manylinux2014_i686 + options: --platform linux/386 + steps: + - name: Checkout pandas Repo + # actions/checkout does not work since it requires node + run: | + git config --global --add safe.directory $PWD + + if [ $GITHUB_EVENT_NAME != pull_request ]; then + git clone --recursive --branch=$GITHUB_REF_NAME https://github.com/${GITHUB_REPOSITORY}.git $GITHUB_WORKSPACE + git reset --hard $GITHUB_SHA + else + git clone --recursive https://github.com/${GITHUB_REPOSITORY}.git $GITHUB_WORKSPACE + git fetch origin $GITHUB_REF:my_ref_name + git checkout $GITHUB_BASE_REF + git -c user.email="you@example.com" merge --no-commit my_ref_name + fi + - name: Build environment and Run Tests + run: | + /opt/python/cp38-cp38/bin/python -m venv ~/virtualenvs/pandas-dev + . ~/virtualenvs/pandas-dev/bin/activate + python -m pip install --no-cache-dir --no-deps -U pip wheel setuptools + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python setup.py build_ext -q -j$(nproc) + python -m pip install --no-cache-dir --no-build-isolation --no-use-pep517 -e . 
+ python -m pip list + export PANDAS_CI=1 + python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-32bit + cancel-in-progress: true + + python-dev: + # This job may or may not run depending on the state of the next + # unreleased Python version. DO NOT DELETE IT. + # + # In general, this will remain frozen(present, but not running) until: + # - The next unreleased Python version has released beta 1 + # - This version should be available on GitHub Actions. + # - Our required build/runtime dependencies(numpy, pytz, Cython, python-dateutil) + # support that unreleased Python version. + # To unfreeze, comment out the ``if: false`` condition, and make sure you update + # the name of the workflow and Python version in actions/setup-python ``python-version:`` + # + # After it has been unfrozen, this file should remain unfrozen(present, and running) until: + # - The next Python version has been officially released. + # OR + # - Most/All of our optional dependencies support the next Python version AND + # - The next Python version has released a rc(we are guaranteed a stable ABI). + # To freeze this file, uncomment out the ``if: false`` condition, and migrate the jobs + # to the corresponding posix/windows-macos/sdist etc. workflows. + # Feel free to modify this comment as necessary. 
+ if: false # Uncomment this to freeze the workflow, comment it to unfreeze + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-22.04, macOS-latest, windows-latest] + + timeout-minutes: 180 + + concurrency: + #https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-${{ matrix.pytest_target }}-dev + cancel-in-progress: true + + env: + PYTEST_WORKERS: "auto" + PANDAS_CI: 1 + PATTERN: "not slow and not network and not clipboard and not single_cpu" + COVERAGE: true + PYTEST_TARGET: pandas + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Python Dev Version + uses: actions/setup-python@v4 + with: + python-version: '3.11-dev' + + - name: Install dependencies + run: | + python --version + python -m pip install --upgrade pip setuptools wheel + python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple numpy + python -m pip install git+https://github.com/nedbat/coveragepy.git + python -m pip install versioneer[toml] + python -m pip install python-dateutil pytz cython hypothesis>=6.46.1 pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 + python -m pip list + + - name: Build Pandas + run: | + python setup.py build_ext -q -j4 + python -m pip install -e . 
--no-build-isolation --no-use-pep517 --no-index + + - name: Build Version + run: | + python -c "import pandas; pandas.show_versions();" + + - name: Test + uses: ./.github/actions/run-tests diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 499bcae5e90f0..1d969e648b752 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -510,6 +510,8 @@ def test_ignore_downcast_neg_to_unsigned(): tm.assert_numpy_array_equal(res, expected) +# Warning in 32 bit platforms +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") @pytest.mark.parametrize("downcast", ["integer", "signed", "unsigned"]) @pytest.mark.parametrize( "data,expected", From 2708c9edf83071ec9aa976930a38260e3508806f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 24 Apr 2023 00:50:11 +0200 Subject: [PATCH 038/577] DOC: Update whatsnew (#52882) --- doc/source/whatsnew/v2.0.1.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index b44cd5f9a1369..4936d7cae1d91 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -1,7 +1,7 @@ .. _whatsnew_201: -What's new in 2.0.1 (May XX, 2023) ----------------------------------- +What's new in 2.0.1 (April 24, 2023) +------------------------------------ These are the changes in pandas 2.0.1. See :ref:`release` for a full changelog including other versions of pandas. @@ -14,12 +14,12 @@ including other versions of pandas. 
Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression for subclassed Series when constructing from a dictionary (:issue:`52445`) +- Fixed regression in :meth:`.SeriesGroupBy.agg` failing when grouping with categorical data, multiple groupings, ``as_index=False``, and a list of aggregations (:issue:`52760`) - Fixed regression in :meth:`DataFrame.pivot` changing :class:`Index` name of input object (:issue:`52629`) - Fixed regression in :meth:`DataFrame.resample` raising on a DataFrame with no columns (:issue:`52484`) - Fixed regression in :meth:`DataFrame.sort_values` not resetting index when :class:`DataFrame` is already sorted and ``ignore_index=True`` (:issue:`52553`) - Fixed regression in :meth:`MultiIndex.isin` raising ``TypeError`` for ``Generator`` (:issue:`52568`) - Fixed regression in :meth:`Series.describe` showing ``RuntimeWarning`` for extension dtype :class:`Series` with one element (:issue:`52515`) -- Fixed regression in :meth:`SeriesGroupBy.agg` failing when grouping with categorical data, multiple groupings, ``as_index=False``, and a list of aggregations (:issue:`52760`) - Fixed regression when adding a new column to a :class:`DataFrame` when the :attr:`DataFrame.columns` was a :class:`RangeIndex` and the new key was hashable but not a scalar (:issue:`52652`) .. 
--------------------------------------------------------------------------- From f42990f5c91c74c29c506efc66130c81007dd5b6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 24 Apr 2023 14:44:10 +0200 Subject: [PATCH 039/577] Install pre-commit automatically in gitpod (#52856) Install pre-commit (cherry picked from commit 5408f696f59a83f5bf218d619702f75fe954864d) --- .gitpod.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitpod.yml b/.gitpod.yml index 8b086a589a378..0a5b5648994ae 100644 --- a/.gitpod.yml +++ b/.gitpod.yml @@ -15,6 +15,7 @@ tasks: git fetch --tags python setup.py build_ext --inplace -j 4 echo "🛠 Completed rebuilding Pandas!! 🛠 " + pre-commit install echo "✨ Pre-build complete! You can close this terminal ✨ " # -------------------------------------------------------- From d88ce9067cf743dd39c227e2ef5b1e9b8d8d75cd Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 24 Apr 2023 16:36:06 +0200 Subject: [PATCH 040/577] DOC: Add whatsnew for 2.0.2 (#52893) --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v2.0.2.rst | 38 ++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 doc/source/whatsnew/v2.0.2.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 555868d001118..42f110dfd7519 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -24,6 +24,7 @@ Version 2.0 .. toctree:: :maxdepth: 2 + v2.0.2 v2.0.1 v2.0.0 diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst new file mode 100644 index 0000000000000..0a6738cb9b3dc --- /dev/null +++ b/doc/source/whatsnew/v2.0.2.rst @@ -0,0 +1,38 @@ +.. _whatsnew_202: + +What's new in 2.0.2 (May ..., 2023) +----------------------------------- + +These are the changes in pandas 2.0.2. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. 
--------------------------------------------------------------------------- +.. _whatsnew_202.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_202.bug_fixes: + +Bug fixes +~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_202.other: + +Other +~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_202.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v2.0.1..v2.0.2|HEAD From 063be94a2ff649ba8546d7e5a79c4e3ee633a3f6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 24 Apr 2023 19:36:24 +0200 Subject: [PATCH 041/577] Fix typo in future warning message (#52889) --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a370300100866..50b39ee977ed4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4349,7 +4349,7 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde _apply_groupings_depr = ( "{}.apply operated on the grouping columns. This behavior is deprecated, " "and in a future version of pandas the grouping columns will be excluded " - "from the operation. Select the columns to operate on after groupby to" + "from the operation. Select the columns to operate on after groupby to " "either explicitly include or exclude the groupings and silence " "this warning." ) From f2d26ae379cbed2e0444c3891b9ac48a494a91e8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Apr 2023 11:24:31 -0700 Subject: [PATCH 042/577] Bump pypa/cibuildwheel from 2.12.1 to 2.12.3 (#52891) Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.12.1 to 2.12.3. 
- [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.12.1...v2.12.3) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 593a98d64e46a..a42957c1cc942 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -71,7 +71,7 @@ jobs: fetch-depth: 0 - name: Build wheels - uses: pypa/cibuildwheel@v2.12.1 + uses: pypa/cibuildwheel@v2.12.3 env: CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} From f87c11f355736aff0d2039335aa7da13c7508a65 Mon Sep 17 00:00:00 2001 From: JHM Darbyshire <24256554+attack68@users.noreply.github.com> Date: Mon, 24 Apr 2023 20:32:42 +0200 Subject: [PATCH 043/577] CLN: remove redundant LatexFormatter and tests (#52844) Co-authored-by: JHM Darbyshire (iMac) --- pandas/io/formats/format.py | 32 - pandas/io/formats/latex.py | 831 ----------------------- pandas/tests/io/formats/test_to_latex.py | 111 +-- 3 files changed, 10 insertions(+), 964 deletions(-) delete mode 100644 pandas/io/formats/latex.py diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index ae67b05047a98..24a396eb9491e 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1022,38 +1022,6 @@ class DataFrameRenderer: def __init__(self, fmt: DataFrameFormatter) -> None: self.fmt = fmt - def to_latex( - self, - buf: FilePath | WriteBuffer[str] | None = None, - column_format: str | None = None, - longtable: bool = False, - encoding: str | None = None, - multicolumn: bool = False, - 
multicolumn_format: str | None = None, - multirow: bool = False, - caption: str | tuple[str, str] | None = None, - label: str | None = None, - position: str | None = None, - ) -> str | None: - """ - Render a DataFrame to a LaTeX tabular/longtable environment output. - """ - from pandas.io.formats.latex import LatexFormatter - - latex_formatter = LatexFormatter( - self.fmt, - longtable=longtable, - column_format=column_format, - multicolumn=multicolumn, - multicolumn_format=multicolumn_format, - multirow=multirow, - caption=caption, - label=label, - position=position, - ) - string = latex_formatter.to_string() - return save_to_buffer(string, buf=buf, encoding=encoding) - def to_html( self, buf: FilePath | WriteBuffer[str] | None = None, diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py deleted file mode 100644 index a97f3d4ef541e..0000000000000 --- a/pandas/io/formats/latex.py +++ /dev/null @@ -1,831 +0,0 @@ -""" -Module for formatting output data in Latex. -""" -from __future__ import annotations - -from abc import ( - ABC, - abstractmethod, -) -from typing import ( - TYPE_CHECKING, - Iterator, - Sequence, -) - -import numpy as np - -from pandas.core.dtypes.generic import ABCMultiIndex - -if TYPE_CHECKING: - from pandas.io.formats.format import DataFrameFormatter - - -def _split_into_full_short_caption( - caption: str | tuple[str, str] | None -) -> tuple[str, str]: - """Extract full and short captions from caption string/tuple. - - Parameters - ---------- - caption : str or tuple, optional - Either table caption string or tuple (full_caption, short_caption). - If string is provided, then it is treated as table full caption, - while short_caption is considered an empty string. - - Returns - ------- - full_caption, short_caption : tuple - Tuple of full_caption, short_caption strings. 
- """ - if caption: - if isinstance(caption, str): - full_caption = caption - short_caption = "" - else: - try: - full_caption, short_caption = caption - except ValueError as err: - msg = "caption must be either a string or a tuple of two strings" - raise ValueError(msg) from err - else: - full_caption = "" - short_caption = "" - return full_caption, short_caption - - -class RowStringConverter: - r"""Converter for dataframe rows into LaTeX strings. - - Parameters - ---------- - formatter : `DataFrameFormatter` - Instance of `DataFrameFormatter`. - multicolumn: bool, optional - Whether to use \multicolumn macro. - multicolumn_format: str, optional - Multicolumn format. - multirow: bool, optional - Whether to use \multirow macro. - - """ - - def __init__( - self, - formatter: DataFrameFormatter, - multicolumn: bool = False, - multicolumn_format: str | None = None, - multirow: bool = False, - ) -> None: - self.fmt = formatter - self.frame = self.fmt.frame - self.multicolumn = multicolumn - self.multicolumn_format = multicolumn_format - self.multirow = multirow - self.clinebuf: list[list[int]] = [] - self.strcols = self._get_strcols() - self.strrows = list(zip(*self.strcols)) - - def get_strrow(self, row_num: int) -> str: - """Get string representation of the row.""" - row = self.strrows[row_num] - - is_multicol = ( - row_num < self.column_levels and self.fmt.header and self.multicolumn - ) - - is_multirow = ( - row_num >= self.header_levels - and self.fmt.index - and self.multirow - and self.index_levels > 1 - ) - - is_cline_maybe_required = is_multirow and row_num < len(self.strrows) - 1 - - crow = self._preprocess_row(row) - - if is_multicol: - crow = self._format_multicolumn(crow) - if is_multirow: - crow = self._format_multirow(crow, row_num) - - lst = [] - lst.append(" & ".join(crow)) - lst.append(" \\\\") - if is_cline_maybe_required: - cline = self._compose_cline(row_num, len(self.strcols)) - lst.append(cline) - return "".join(lst) - - @property - def 
_header_row_num(self) -> int: - """Number of rows in header.""" - return self.header_levels if self.fmt.header else 0 - - @property - def index_levels(self) -> int: - """Integer number of levels in index.""" - return self.frame.index.nlevels - - @property - def column_levels(self) -> int: - return self.frame.columns.nlevels - - @property - def header_levels(self) -> int: - nlevels = self.column_levels - if self.fmt.has_index_names and self.fmt.show_index_names: - nlevels += 1 - return nlevels - - def _get_strcols(self) -> list[list[str]]: - """String representation of the columns.""" - if self.fmt.frame.empty: - strcols = [[self._empty_info_line]] - else: - strcols = self.fmt.get_strcols() - - # reestablish the MultiIndex that has been joined by get_strcols() - if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex): - out = self.frame.index.format( - adjoin=False, - sparsify=self.fmt.sparsify, - names=self.fmt.has_index_names, - na_rep=self.fmt.na_rep, - ) - - # index.format will sparsify repeated entries with empty strings - # so pad these with some empty space - def pad_empties(x): - for pad in reversed(x): - if pad: - return [x[0]] + [i if i else " " * len(pad) for i in x[1:]] - - gen = (pad_empties(i) for i in out) - - # Add empty spaces for each column level - clevels = self.frame.columns.nlevels - out = [[" " * len(i[-1])] * clevels + i for i in gen] - - # Add the column names to the last index column - cnames = self.frame.columns.names - if any(cnames): - new_names = [i if i else "{}" for i in cnames] - out[self.frame.index.nlevels - 1][:clevels] = new_names - - # Get rid of old multiindex column and add new ones - strcols = out + strcols[1:] - return strcols - - @property - def _empty_info_line(self) -> str: - return ( - f"Empty {type(self.frame).__name__}\n" - f"Columns: {self.frame.columns}\n" - f"Index: {self.frame.index}" - ) - - def _preprocess_row(self, row: Sequence[str]) -> list[str]: - """Preprocess elements of the row.""" - if 
self.fmt.escape: - crow = _escape_symbols(row) - else: - crow = [x if x else "{}" for x in row] - if self.fmt.bold_rows and self.fmt.index: - crow = _convert_to_bold(crow, self.index_levels) - return crow - - def _format_multicolumn(self, row: list[str]) -> list[str]: - r""" - Combine columns belonging to a group to a single multicolumn entry - according to self.multicolumn_format - - e.g.: - a & & & b & c & - will become - \multicolumn{3}{l}{a} & b & \multicolumn{2}{l}{c} - """ - row2 = row[: self.index_levels] - ncol = 1 - coltext = "" - - def append_col() -> None: - # write multicolumn if needed - if ncol > 1: - row2.append( - f"\\multicolumn{{{ncol:d}}}{{{self.multicolumn_format}}}" - f"{{{coltext.strip()}}}" - ) - # don't modify where not needed - else: - row2.append(coltext) - - for c in row[self.index_levels :]: - # if next col has text, write the previous - if c.strip(): - if coltext: - append_col() - coltext = c - ncol = 1 - # if not, add it to the previous multicolumn - else: - ncol += 1 - # write last column name - if coltext: - append_col() - return row2 - - def _format_multirow(self, row: list[str], i: int) -> list[str]: - r""" - Check following rows, whether row should be a multirow - - e.g.: becomes: - a & 0 & \multirow{2}{*}{a} & 0 & - & 1 & & 1 & - b & 0 & \cline{1-2} - b & 0 & - """ - for j in range(self.index_levels): - if row[j].strip(): - nrow = 1 - for r in self.strrows[i + 1 :]: - if not r[j].strip(): - nrow += 1 - else: - break - if nrow > 1: - # overwrite non-multirow entry - row[j] = f"\\multirow{{{nrow:d}}}{{*}}{{{row[j].strip()}}}" - # save when to end the current block with \cline - self.clinebuf.append([i + nrow - 1, j + 1]) - return row - - def _compose_cline(self, i: int, icol: int) -> str: - """ - Create clines after multirow-blocks are finished. 
- """ - lst = [] - for cl in self.clinebuf: - if cl[0] == i: - lst.append(f"\n\\cline{{{cl[1]:d}-{icol:d}}}") - # remove entries that have been written to buffer - self.clinebuf = [x for x in self.clinebuf if x[0] != i] - return "".join(lst) - - -class RowStringIterator(RowStringConverter): - """Iterator over rows of the header or the body of the table.""" - - @abstractmethod - def __iter__(self) -> Iterator[str]: - """Iterate over LaTeX string representations of rows.""" - - -class RowHeaderIterator(RowStringIterator): - """Iterator for the table header rows.""" - - def __iter__(self) -> Iterator[str]: - for row_num in range(len(self.strrows)): - if row_num < self._header_row_num: - yield self.get_strrow(row_num) - - -class RowBodyIterator(RowStringIterator): - """Iterator for the table body rows.""" - - def __iter__(self) -> Iterator[str]: - for row_num in range(len(self.strrows)): - if row_num >= self._header_row_num: - yield self.get_strrow(row_num) - - -class TableBuilderAbstract(ABC): - """ - Abstract table builder producing string representation of LaTeX table. - - Parameters - ---------- - formatter : `DataFrameFormatter` - Instance of `DataFrameFormatter`. - column_format: str, optional - Column format, for example, 'rcl' for three columns. - multicolumn: bool, optional - Use multicolumn to enhance MultiIndex columns. - multicolumn_format: str, optional - The alignment for multicolumns, similar to column_format. - multirow: bool, optional - Use multirow to enhance MultiIndex rows. - caption: str, optional - Table caption. - short_caption: str, optional - Table short caption. - label: str, optional - LaTeX label. - position: str, optional - Float placement specifier, for example, 'htb'. 
- """ - - def __init__( - self, - formatter: DataFrameFormatter, - column_format: str | None = None, - multicolumn: bool = False, - multicolumn_format: str | None = None, - multirow: bool = False, - caption: str | None = None, - short_caption: str | None = None, - label: str | None = None, - position: str | None = None, - ) -> None: - self.fmt = formatter - self.column_format = column_format - self.multicolumn = multicolumn - self.multicolumn_format = multicolumn_format - self.multirow = multirow - self.caption = caption - self.short_caption = short_caption - self.label = label - self.position = position - - def get_result(self) -> str: - """String representation of LaTeX table.""" - elements = [ - self.env_begin, - self.top_separator, - self.header, - self.middle_separator, - self.env_body, - self.bottom_separator, - self.env_end, - ] - result = "\n".join([item for item in elements if item]) - trailing_newline = "\n" - result += trailing_newline - return result - - @property - @abstractmethod - def env_begin(self) -> str: - """Beginning of the environment.""" - - @property - @abstractmethod - def top_separator(self) -> str: - """Top level separator.""" - - @property - @abstractmethod - def header(self) -> str: - """Header lines.""" - - @property - @abstractmethod - def middle_separator(self) -> str: - """Middle level separator.""" - - @property - @abstractmethod - def env_body(self) -> str: - """Environment body.""" - - @property - @abstractmethod - def bottom_separator(self) -> str: - """Bottom level separator.""" - - @property - @abstractmethod - def env_end(self) -> str: - """End of the environment.""" - - -class GenericTableBuilder(TableBuilderAbstract): - """Table builder producing string representation of LaTeX table.""" - - @property - def header(self) -> str: - iterator = self._create_row_iterator(over="header") - return "\n".join(list(iterator)) - - @property - def top_separator(self) -> str: - return "\\toprule" - - @property - def middle_separator(self) 
-> str: - return "\\midrule" if self._is_separator_required() else "" - - @property - def env_body(self) -> str: - iterator = self._create_row_iterator(over="body") - return "\n".join(list(iterator)) - - def _is_separator_required(self) -> bool: - return bool(self.header and self.env_body) - - @property - def _position_macro(self) -> str: - r"""Position macro, extracted from self.position, like [h].""" - return f"[{self.position}]" if self.position else "" - - @property - def _caption_macro(self) -> str: - r"""Caption macro, extracted from self.caption. - - With short caption: - \caption[short_caption]{caption_string}. - - Without short caption: - \caption{caption_string}. - """ - if self.caption: - return "".join( - [ - r"\caption", - f"[{self.short_caption}]" if self.short_caption else "", - f"{{{self.caption}}}", - ] - ) - return "" - - @property - def _label_macro(self) -> str: - r"""Label macro, extracted from self.label, like \label{ref}.""" - return f"\\label{{{self.label}}}" if self.label else "" - - def _create_row_iterator(self, over: str) -> RowStringIterator: - """Create iterator over header or body of the table. - - Parameters - ---------- - over : {'body', 'header'} - Over what to iterate. - - Returns - ------- - RowStringIterator - Iterator over body or header. - """ - iterator_kind = self._select_iterator(over) - return iterator_kind( - formatter=self.fmt, - multicolumn=self.multicolumn, - multicolumn_format=self.multicolumn_format, - multirow=self.multirow, - ) - - def _select_iterator(self, over: str) -> type[RowStringIterator]: - """Select proper iterator over table rows.""" - if over == "header": - return RowHeaderIterator - elif over == "body": - return RowBodyIterator - else: - msg = f"'over' must be either 'header' or 'body', but {over} was provided" - raise ValueError(msg) - - -class LongTableBuilder(GenericTableBuilder): - """Concrete table builder for longtable. 
- - >>> from pandas.io.formats import format as fmt - >>> df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - >>> formatter = fmt.DataFrameFormatter(df) - >>> builder = LongTableBuilder(formatter, caption='a long table', - ... label='tab:long', column_format='lrl') - >>> table = builder.get_result() - >>> print(table) - \\begin{longtable}{lrl} - \\caption{a long table} - \\label{tab:long}\\\\ - \\toprule - {} & a & b \\\\ - \\midrule - \\endfirsthead - \\caption[]{a long table} \\\\ - \\toprule - {} & a & b \\\\ - \\midrule - \\endhead - \\midrule - \\multicolumn{3}{r}{{Continued on next page}} \\\\ - \\midrule - \\endfoot - - \\bottomrule - \\endlastfoot - 0 & 1 & b1 \\\\ - 1 & 2 & b2 \\\\ - \\end{longtable} - - """ - - @property - def env_begin(self) -> str: - first_row = ( - f"\\begin{{longtable}}{self._position_macro}{{{self.column_format}}}" - ) - elements = [first_row, f"{self._caption_and_label()}"] - return "\n".join([item for item in elements if item]) - - def _caption_and_label(self) -> str: - if self.caption or self.label: - double_backslash = "\\\\" - elements = [f"{self._caption_macro}", f"{self._label_macro}"] - caption_and_label = "\n".join([item for item in elements if item]) - caption_and_label += double_backslash - return caption_and_label - else: - return "" - - @property - def middle_separator(self) -> str: - iterator = self._create_row_iterator(over="header") - - # the content between \endfirsthead and \endhead commands - # mitigates repeated List of Tables entries in the final LaTeX - # document when dealing with longtable environments; GH #34360 - elements = [ - "\\midrule", - "\\endfirsthead", - f"\\caption[]{{{self.caption}}} \\\\" if self.caption else "", - self.top_separator, - self.header, - "\\midrule", - "\\endhead", - "\\midrule", - f"\\multicolumn{{{len(iterator.strcols)}}}{{r}}" - "{{Continued on next page}} \\\\", - "\\midrule", - "\\endfoot\n", - "\\bottomrule", - "\\endlastfoot", - ] - if self._is_separator_required(): - return 
"\n".join(elements) - return "" - - @property - def bottom_separator(self) -> str: - return "" - - @property - def env_end(self) -> str: - return "\\end{longtable}" - - -class RegularTableBuilder(GenericTableBuilder): - """Concrete table builder for regular table. - - >>> from pandas.io.formats import format as fmt - >>> df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - >>> formatter = fmt.DataFrameFormatter(df) - >>> builder = RegularTableBuilder(formatter, caption='caption', label='lab', - ... column_format='lrc') - >>> table = builder.get_result() - >>> print(table) - \\begin{table} - \\centering - \\caption{caption} - \\label{lab} - \\begin{tabular}{lrc} - \\toprule - {} & a & b \\\\ - \\midrule - 0 & 1 & b1 \\\\ - 1 & 2 & b2 \\\\ - \\bottomrule - \\end{tabular} - \\end{table} - - """ - - @property - def env_begin(self) -> str: - elements = [ - f"\\begin{{table}}{self._position_macro}", - "\\centering", - f"{self._caption_macro}", - f"{self._label_macro}", - f"\\begin{{tabular}}{{{self.column_format}}}", - ] - return "\n".join([item for item in elements if item]) - - @property - def bottom_separator(self) -> str: - return "\\bottomrule" - - @property - def env_end(self) -> str: - return "\n".join(["\\end{tabular}", "\\end{table}"]) - - -class TabularBuilder(GenericTableBuilder): - """Concrete table builder for tabular environment. 
- - >>> from pandas.io.formats import format as fmt - >>> df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - >>> formatter = fmt.DataFrameFormatter(df) - >>> builder = TabularBuilder(formatter, column_format='lrc') - >>> table = builder.get_result() - >>> print(table) - \\begin{tabular}{lrc} - \\toprule - {} & a & b \\\\ - \\midrule - 0 & 1 & b1 \\\\ - 1 & 2 & b2 \\\\ - \\bottomrule - \\end{tabular} - - """ - - @property - def env_begin(self) -> str: - return f"\\begin{{tabular}}{{{self.column_format}}}" - - @property - def bottom_separator(self) -> str: - return "\\bottomrule" - - @property - def env_end(self) -> str: - return "\\end{tabular}" - - -class LatexFormatter: - r""" - Used to render a DataFrame to a LaTeX tabular/longtable environment output. - - Parameters - ---------- - formatter : `DataFrameFormatter` - longtable : bool, default False - Use longtable environment. - column_format : str, default None - The columns format as specified in `LaTeX table format - `__ e.g 'rcl' for 3 columns - multicolumn : bool, default False - Use \multicolumn to enhance MultiIndex columns. - multicolumn_format : str, default 'l' - The alignment for multicolumns, similar to `column_format` - multirow : bool, default False - Use \multirow to enhance MultiIndex rows. - caption : str or tuple, optional - Tuple (full_caption, short_caption), - which results in \caption[short_caption]{full_caption}; - if a single string is passed, no short caption will be set. - label : str, optional - The LaTeX label to be placed inside ``\label{}`` in the output. - position : str, optional - The LaTeX positional argument for tables, to be placed after - ``\begin{}`` in the output. 
- - See Also - -------- - HTMLFormatter - """ - - def __init__( - self, - formatter: DataFrameFormatter, - longtable: bool = False, - column_format: str | None = None, - multicolumn: bool = False, - multicolumn_format: str | None = None, - multirow: bool = False, - caption: str | tuple[str, str] | None = None, - label: str | None = None, - position: str | None = None, - ) -> None: - self.fmt = formatter - self.frame = self.fmt.frame - self.longtable = longtable - self.column_format = column_format - self.multicolumn = multicolumn - self.multicolumn_format = multicolumn_format - self.multirow = multirow - self.caption, self.short_caption = _split_into_full_short_caption(caption) - self.label = label - self.position = position - - def to_string(self) -> str: - """ - Render a DataFrame to a LaTeX tabular, longtable, or table/tabular - environment output. - """ - return self.builder.get_result() - - @property - def builder(self) -> TableBuilderAbstract: - """Concrete table builder. - - Returns - ------- - TableBuilder - """ - builder = self._select_builder() - return builder( - formatter=self.fmt, - column_format=self.column_format, - multicolumn=self.multicolumn, - multicolumn_format=self.multicolumn_format, - multirow=self.multirow, - caption=self.caption, - short_caption=self.short_caption, - label=self.label, - position=self.position, - ) - - def _select_builder(self) -> type[TableBuilderAbstract]: - """Select proper table builder.""" - if self.longtable: - return LongTableBuilder - if any([self.caption, self.label, self.position]): - return RegularTableBuilder - return TabularBuilder - - @property - def column_format(self) -> str | None: - """Column format.""" - return self._column_format - - @column_format.setter - def column_format(self, input_column_format: str | None) -> None: - """Setter for column format.""" - if input_column_format is None: - self._column_format = ( - self._get_index_format() + self._get_column_format_based_on_dtypes() - ) - elif not 
isinstance(input_column_format, str): - raise ValueError( - f"column_format must be str or unicode, " - f"not {type(input_column_format)}" - ) - else: - self._column_format = input_column_format - - def _get_column_format_based_on_dtypes(self) -> str: - """Get column format based on data type. - - Right alignment for numbers and left - for strings. - """ - - def get_col_type(dtype) -> str: - if issubclass(dtype.type, np.number): - return "r" - return "l" - - dtypes = self.frame.dtypes._values - return "".join(map(get_col_type, dtypes)) - - def _get_index_format(self) -> str: - """Get index column format.""" - return "l" * self.frame.index.nlevels if self.fmt.index else "" - - -def _escape_symbols(row: Sequence[str]) -> list[str]: - """Carry out string replacements for special symbols. - - Parameters - ---------- - row : list - List of string, that may contain special symbols. - - Returns - ------- - list - list of strings with the special symbols replaced. - """ - return [ - ( - x.replace("\\", "\\textbackslash ") - .replace("_", "\\_") - .replace("%", "\\%") - .replace("$", "\\$") - .replace("#", "\\#") - .replace("{", "\\{") - .replace("}", "\\}") - .replace("~", "\\textasciitilde ") - .replace("^", "\\textasciicircum ") - .replace("&", "\\&") - if (x and x != "{}") - else "{}" - ) - for x in row - ] - - -def _convert_to_bold(crow: Sequence[str], ilevels: int) -> list[str]: - """Convert elements in ``crow`` to bold.""" - return [ - f"\\textbf{{{x}}}" if j < ilevels and x.strip() not in ["", "{}"] else x - for j, x in enumerate(crow) - ] - - -if __name__ == "__main__": - import doctest - - doctest.testmod() diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index c0bbc22fc1746..64c064172a646 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -11,14 +11,6 @@ ) import pandas._testing as tm -from pandas.io.formats.format import DataFrameFormatter -from 
pandas.io.formats.latex import ( - RegularTableBuilder, - RowBodyIterator, - RowHeaderIterator, - RowStringConverter, -) - pytest.importorskip("jinja2") @@ -1417,97 +1409,14 @@ def test_to_latex_multiindex_multirow(self): assert result == expected -class TestTableBuilder: - @pytest.fixture - def dataframe(self): - return DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - - @pytest.fixture - def table_builder(self, dataframe): - return RegularTableBuilder(formatter=DataFrameFormatter(dataframe)) - - def test_create_row_iterator(self, table_builder): - iterator = table_builder._create_row_iterator(over="header") - assert isinstance(iterator, RowHeaderIterator) - - def test_create_body_iterator(self, table_builder): - iterator = table_builder._create_row_iterator(over="body") - assert isinstance(iterator, RowBodyIterator) - - def test_create_body_wrong_kwarg_raises(self, table_builder): - with pytest.raises(ValueError, match="must be either 'header' or 'body'"): - table_builder._create_row_iterator(over="SOMETHING BAD") - - -class TestRowStringConverter: - @pytest.mark.parametrize( - "row_num, expected", - [ - (0, r"{} & Design & ratio & xy \\"), - (1, r"0 & 1 & 4 & 10 \\"), - (2, r"1 & 2 & 5 & 11 \\"), - ], - ) - def test_get_strrow_normal_without_escape(self, row_num, expected): - df = DataFrame({r"Design": [1, 2, 3], r"ratio": [4, 5, 6], r"xy": [10, 11, 12]}) - row_string_converter = RowStringConverter( - formatter=DataFrameFormatter(df, escape=True), - ) - assert row_string_converter.get_strrow(row_num=row_num) == expected - - @pytest.mark.parametrize( - "row_num, expected", - [ - (0, r"{} & Design \# & ratio, \% & x\&y \\"), - (1, r"0 & 1 & 4 & 10 \\"), - (2, r"1 & 2 & 5 & 11 \\"), - ], - ) - def test_get_strrow_normal_with_escape(self, row_num, expected): - df = DataFrame( - {r"Design #": [1, 2, 3], r"ratio, %": [4, 5, 6], r"x&y": [10, 11, 12]} - ) - row_string_converter = RowStringConverter( - formatter=DataFrameFormatter(df, escape=True), - ) - assert 
row_string_converter.get_strrow(row_num=row_num) == expected - - @pytest.mark.parametrize( - "row_num, expected", - [ - (0, r"{} & \multicolumn{2}{r}{c1} & \multicolumn{2}{r}{c2} & c3 \\"), - (1, r"{} & 0 & 1 & 0 & 1 & 0 \\"), - (2, r"0 & 0 & 5 & 0 & 5 & 0 \\"), - ], +def test_to_latex_exceeding_float_point_double(): + df = DataFrame(data=[[1234567890123456789]], columns=["test"]) + expected = _dedent( + r""" + \begin{tabular}{lr} + & test \\ + 0 & 1234567890123456789 \\ + \end{tabular} + """ ) - def test_get_strrow_multindex_multicolumn(self, row_num, expected): - df = DataFrame( - { - ("c1", 0): {x: x for x in range(5)}, - ("c1", 1): {x: x + 5 for x in range(5)}, - ("c2", 0): {x: x for x in range(5)}, - ("c2", 1): {x: x + 5 for x in range(5)}, - ("c3", 0): {x: x for x in range(5)}, - } - ) - - row_string_converter = RowStringConverter( - formatter=DataFrameFormatter(df), - multicolumn=True, - multicolumn_format="r", - multirow=True, - ) - - assert row_string_converter.get_strrow(row_num=row_num) == expected - - def test_to_latex_exceeding_float_point_double(self): - df = DataFrame(data=[[1234567890123456789]], columns=["test"]) - expected = _dedent( - r""" - \begin{tabular}{lr} - & test \\ - 0 & 1234567890123456789 \\ - \end{tabular} - """ - ) - assert df.style.to_latex() == expected + assert df.style.to_latex() == expected From 52e8c11c47d19c3166e7c94e1ef1695a842d0e9d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 24 Apr 2023 11:49:47 -0700 Subject: [PATCH 044/577] BUG: mean/median with strings (#52281) * BUG: converting string to numeric in median, mean * whatsnew, median test * troubleshoot builds * fix arraymanager build * say in whatsnew we raise TypeError --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 3 ++ pandas/core/nanops.py | 15 ++++++- pandas/tests/apply/test_invalid_arg.py | 3 ++ pandas/tests/frame/test_reductions.py | 44 +++++++++++++++---- 
pandas/tests/groupby/test_function.py | 6 +++ pandas/tests/groupby/test_groupby.py | 5 ++- pandas/tests/groupby/test_raises.py | 22 ++++++++-- pandas/tests/resample/test_resample_api.py | 4 +- pandas/tests/series/test_reductions.py | 49 ++++++++++++++++++++++ pandas/tests/test_nanops.py | 19 ++++++--- 10 files changed, 149 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index b0e9fa2cea0ee..b7d17d1abbe91 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -313,8 +313,11 @@ Timezones Numeric ^^^^^^^ +- Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`) +- Bug in :meth:`Series.mean`, :meth:`DataFrame.mean` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results; these now raise ``TypeError`` (:issue:`36703`, :issue:`44008`) - Bug in :meth:`DataFrame.corrwith` raising ``NotImplementedError`` for pyarrow-backed dtypes (:issue:`52314`) - Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`) +- Bug in :meth:`Series.median` and :meth:`DataFrame.median` with object-dtype values containing strings that can be converted to numbers (e.g. 
"2") returning incorrect numeric results; these now raise ``TypeError`` (:issue:`34671`) - Conversion diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index ddec07c8bf890..8fddc8461dfbe 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -716,7 +716,8 @@ def nanmean( dtype_count = dtype count = _get_counts(values.shape, mask, axis, dtype=dtype_count) - the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum)) + the_sum = values.sum(axis, dtype=dtype_sum) + the_sum = _ensure_numeric(the_sum) if axis is not None and getattr(the_sum, "ndim", False): count = cast(np.ndarray, count) @@ -775,6 +776,11 @@ def get_median(x, _mask=None): dtype = values.dtype values, mask = _get_values(values, skipna, mask=mask, fill_value=0) if values.dtype.kind != "f": + if values.dtype == object: + # GH#34671 avoid casting strings to numeric + inferred = lib.infer_dtype(values) + if inferred in ["string", "mixed"]: + raise TypeError(f"Cannot convert {values} to numeric") try: values = values.astype("f8") except ValueError as err: @@ -1659,6 +1665,10 @@ def _ensure_numeric(x): if x.dtype.kind in "biu": x = x.astype(np.float64) elif x.dtype == object: + inferred = lib.infer_dtype(x) + if inferred in ["string", "mixed"]: + # GH#44008, GH#36703 avoid casting e.g. strings to numeric + raise TypeError(f"Could not convert {x} to numeric") try: x = x.astype(np.complex128) except (TypeError, ValueError): @@ -1671,6 +1681,9 @@ def _ensure_numeric(x): if not np.any(np.imag(x)): x = x.real elif not (is_float(x) or is_integer(x) or is_complex(x)): + if isinstance(x, str): + # GH#44008, GH#36703 avoid casting e.g. 
strings to numeric + raise TypeError(f"Could not convert string '{x}' to numeric") try: x = float(x) except (TypeError, ValueError): diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index 5995b78d4bea5..d75b784302676 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -244,6 +244,9 @@ def test_agg_cython_table_raises_frame(df, func, expected, axis): def test_agg_cython_table_raises_series(series, func, expected): # GH21224 msg = r"[Cc]ould not convert|can't multiply sequence by non-int of type" + if func == "median" or func is np.nanmedian or func is np.median: + msg = r"Cannot convert \['a' 'b' 'c'\] to numeric" + with pytest.raises(expected, match=msg): # e.g. Series('a b'.split()).cumprod() will raise series.agg(func) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 0d352b8e34f37..096f6fe83ea88 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -169,15 +169,30 @@ def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname): ): getattr(float_string_frame, opname)(axis=axis) else: - msg = "|".join( - [ - "Could not convert", - "could not convert", - "can't multiply sequence by non-int", - "unsupported operand type", - "not supported between instances of", - ] - ) + if opname in ["var", "std", "sem", "skew", "kurt"]: + msg = "could not convert string to float: 'bar'" + elif opname == "product": + if axis == 1: + msg = "can't multiply sequence by non-int of type 'float'" + else: + msg = "can't multiply sequence by non-int of type 'str'" + elif opname == "sum": + msg = r"unsupported operand type\(s\) for \+: 'float' and 'str'" + elif opname == "mean": + if axis == 0: + # different message on different builds + msg = "|".join( + [ + r"Could not convert \['.*'\] to numeric", + "Could not convert string '(bar){30}' to numeric", + ] + ) + else: + msg = r"unsupported 
operand type\(s\) for \+: 'float' and 'str'" + elif opname in ["min", "max"]: + msg = "'[><]=' not supported between instances of 'float' and 'str'" + elif opname == "median": + msg = re.compile(r"Cannot convert \[.*\] to numeric", flags=re.S) with pytest.raises(TypeError, match=msg): getattr(float_string_frame, opname)(axis=axis) if opname != "nunique": @@ -1759,5 +1774,16 @@ def test_fails_on_non_numeric(kernel): "argument must be a string or a real number", ] ) + if kernel == "median": + # slightly different message on different builds + msg1 = ( + r"Cannot convert \[\[ " + r"\]\] to numeric" + ) + msg2 = ( + r"Cannot convert \[ " + r"\] to numeric" + ) + msg = "|".join([msg1, msg2]) with pytest.raises(TypeError, match=msg): getattr(df, kernel)(*args) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 159c620e36cdd..22fa65f3bdda1 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -262,6 +262,8 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "can't multiply sequence by non-int of type 'str'", ] ) + if method == "median": + msg = r"Cannot convert \['a' 'b'\] to numeric" with pytest.raises(exception, match=msg): getattr(gb, method)() else: @@ -279,6 +281,8 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): f"Cannot perform {method} with non-ordered Categorical", ] ) + if method == "median": + msg = r"Cannot convert \['a' 'b'\] to numeric" with pytest.raises(exception, match=msg): getattr(gb, method)(numeric_only=False) else: @@ -1467,6 +1471,8 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): "function is not implemented for this dtype", ] ) + if kernel == "median": + msg = r"Cannot convert \[ \] to numeric" with pytest.raises(exception, match=msg): method(*args, **kwargs) elif not has_arg and numeric_only is not lib.no_default: diff --git a/pandas/tests/groupby/test_groupby.py 
b/pandas/tests/groupby/test_groupby.py index cd33f031720e1..514c0fe82ff5f 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -655,7 +655,8 @@ def test_frame_multi_key_function_list_partial_failure(): grouped = data.groupby(["A", "B"]) funcs = [np.mean, np.std] - with pytest.raises(TypeError, match="Could not convert dullshinyshiny to numeric"): + msg = "Could not convert string 'dullshinyshiny' to numeric" + with pytest.raises(TypeError, match=msg): grouped.agg(funcs) @@ -974,6 +975,8 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): # columns when numeric_only is False klass = ValueError if agg_function in ("std", "sem") else TypeError msg = "|".join(["[C|c]ould not convert", "can't multiply sequence"]) + if agg_function == "median": + msg = r"Cannot convert \['one' 'three' 'two'\] to numeric" with pytest.raises(klass, match=msg): getattr(grouped, agg_function)(numeric_only=numeric_only) else: diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 9b3c7543def68..55a6bc37d6046 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -147,8 +147,21 @@ def test_groupby_raises_string( "idxmin": (TypeError, "'argmin' not allowed for this dtype"), "last": (None, ""), "max": (None, ""), - "mean": (TypeError, "Could not convert xy?z?w?t?y?u?i?o? 
to numeric"), - "median": (TypeError, "could not convert string to float"), + "mean": ( + TypeError, + "Could not convert string '(xy|xyzwt|xyz|xztuo)' to numeric", + ), + "median": ( + TypeError, + "|".join( + [ + r"Cannot convert \['x' 'y' 'z'\] to numeric", + r"Cannot convert \['x' 'y'\] to numeric", + r"Cannot convert \['x' 'y' 'z' 'w' 't'\] to numeric", + r"Cannot convert \['x' 'z' 't' 'u' 'o'\] to numeric", + ] + ), + ), "min": (None, ""), "ngroup": (None, ""), "nunique": (None, ""), @@ -197,7 +210,10 @@ def test_groupby_raises_string_np( klass, msg = { np.sum: (None, ""), - np.mean: (TypeError, "Could not convert xy?z?w?t?y?u?i?o? to numeric"), + np.mean: ( + TypeError, + "Could not convert string '(xyzwt|xy|xyz|xztuo)' to numeric", + ), }[groupby_func_np] _call_and_check(klass, msg, how, gb, groupby_func_np, tuple()) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 4b86a25f9587d..6a22faa623f69 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -857,8 +857,8 @@ def test_end_and_end_day_origin( ("mean", False, "Could not convert"), ("mean", lib.no_default, "Could not convert"), ("median", True, {"num": [12.5]}), - ("median", False, "could not convert"), - ("median", lib.no_default, "could not convert"), + ("median", False, r"Cannot convert \['cat_1' 'cat_2'\] to numeric"), + ("median", lib.no_default, r"Cannot convert \['cat_1' 'cat_2'\] to numeric"), ("std", True, {"num": [10.606601717798213]}), ("std", False, "could not convert string to float"), ("std", lib.no_default, "could not convert string to float"), diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index eb11b62a651cc..0152303a7269a 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -129,3 +129,52 @@ def test_validate_stat_keepdims(): ) with pytest.raises(ValueError, match=msg): np.sum(ser, 
keepdims=True) + + +def test_mean_with_convertible_string_raises(using_array_manager): + # GH#44008 + ser = Series(["1", "2"]) + assert ser.sum() == "12" + msg = "Could not convert string '12' to numeric" + with pytest.raises(TypeError, match=msg): + ser.mean() + + df = ser.to_frame() + if not using_array_manager: + msg = r"Could not convert \['12'\] to numeric" + with pytest.raises(TypeError, match=msg): + df.mean() + + +def test_mean_dont_convert_j_to_complex(using_array_manager): + # GH#36703 + df = pd.DataFrame([{"db": "J", "numeric": 123}]) + if using_array_manager: + msg = "Could not convert string 'J' to numeric" + else: + msg = r"Could not convert \['J'\] to numeric" + with pytest.raises(TypeError, match=msg): + df.mean() + + with pytest.raises(TypeError, match=msg): + df.agg("mean") + + msg = "Could not convert string 'J' to numeric" + with pytest.raises(TypeError, match=msg): + df["db"].mean() + with pytest.raises(TypeError, match=msg): + np.mean(df["db"].astype("string").array) + + +def test_median_with_convertible_string_raises(using_array_manager): + # GH#34671 this _could_ return a string "2", but definitely not float 2.0 + msg = r"Cannot convert \['1' '2' '3'\] to numeric" + ser = Series(["1", "2", "3"]) + with pytest.raises(TypeError, match=msg): + ser.median() + + if not using_array_manager: + msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric" + df = ser.to_frame() + with pytest.raises(TypeError, match=msg): + df.median() diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 971535bd7d783..7d258033748b6 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -850,7 +850,9 @@ def test_ndarray(self): # Test convertible string ndarray s_values = np.array(["1", "2", "3"], dtype=object) - assert np.allclose(nanops._ensure_numeric(s_values), values) + msg = r"Could not convert \['1' '2' '3'\] to numeric" + with pytest.raises(TypeError, match=msg): + nanops._ensure_numeric(s_values) # Test non-convertible 
string ndarray s_values = np.array(["foo", "bar", "baz"], dtype=object) @@ -859,12 +861,19 @@ def test_ndarray(self): nanops._ensure_numeric(s_values) def test_convertable_values(self): - assert np.allclose(nanops._ensure_numeric("1"), 1.0) - assert np.allclose(nanops._ensure_numeric("1.1"), 1.1) - assert np.allclose(nanops._ensure_numeric("1+1j"), 1 + 1j) + with pytest.raises(TypeError, match="Could not convert string '1' to numeric"): + nanops._ensure_numeric("1") + with pytest.raises( + TypeError, match="Could not convert string '1.1' to numeric" + ): + nanops._ensure_numeric("1.1") + with pytest.raises( + TypeError, match=r"Could not convert string '1\+1j' to numeric" + ): + nanops._ensure_numeric("1+1j") def test_non_convertable_values(self): - msg = "Could not convert foo to numeric" + msg = "Could not convert string 'foo' to numeric" with pytest.raises(TypeError, match=msg): nanops._ensure_numeric("foo") From a2b6f12b67c219b006ef094edd381c3a7b3ac453 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 25 Apr 2023 01:50:42 +0100 Subject: [PATCH 045/577] BUG: interchange bitmasks not supported in interchange/from_dataframe.py (#52824) * support bitmasks in interchange * remove dead code * fixup for slice, add tests * tighten typing * reduce diff * post-merge fixup * add new whatsnew note * move to 2.0.2 * revert --------- Co-authored-by: MarcoGorelli <> --- doc/source/whatsnew/v2.0.2.rst | 2 + pandas/core/interchange/from_dataframe.py | 104 +++++++--------------- pandas/tests/interchange/test_impl.py | 26 ++++++ 3 files changed, 62 insertions(+), 70 deletions(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 0a6738cb9b3dc..09932a2d2d571 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -20,6 +20,8 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect 
sizes when called on slices (:issue:`52824`) +- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 998f3bc374942..45d6bdd7917c1 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -6,6 +6,8 @@ import numpy as np +from pandas.compat._optional import import_optional_dependency + import pandas as pd from pandas.core.interchange.dataframe_protocol import ( Buffer, @@ -23,7 +25,7 @@ DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64}, DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64}, DtypeKind.FLOAT: {32: np.float32, 64: np.float64}, - DtypeKind.BOOL: {8: bool}, + DtypeKind.BOOL: {1: bool, 8: bool}, } @@ -154,7 +156,9 @@ def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: buffers = col.get_buffers() data_buff, data_dtype = buffers["data"] - data = buffer_to_ndarray(data_buff, data_dtype, col.offset, col.size()) + data = buffer_to_ndarray( + data_buff, data_dtype, offset=col.offset, length=col.size() + ) data = set_nulls(data, col, buffers["validity"]) return data, buffers @@ -192,7 +196,9 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]: buffers = col.get_buffers() codes_buff, codes_dtype = buffers["data"] - codes = buffer_to_ndarray(codes_buff, codes_dtype, col.offset, col.size()) + codes = buffer_to_ndarray( + codes_buff, codes_dtype, offset=col.offset, length=col.size() + ) # Doing module in order to not get ``IndexError`` for # out-of-bounds sentinel values in `codes` @@ -252,7 +258,7 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: Endianness.NATIVE, ) # Specify zero offset as we don't want to chunk the string data - data = buffer_to_ndarray(data_buff, data_dtype, 
offset=0, length=col.size()) + data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize) # Retrieve the offsets buffer containing the index offsets demarcating # the beginning and the ending of each string @@ -261,14 +267,16 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: # meaning that it has more elements than in the data buffer, do `col.size() + 1` # here to pass a proper offsets buffer size offsets = buffer_to_ndarray( - offset_buff, offset_dtype, col.offset, length=col.size() + 1 + offset_buff, offset_dtype, offset=col.offset, length=col.size() + 1 ) null_pos = None if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): assert buffers["validity"], "Validity buffers cannot be empty for masks" valid_buff, valid_dtype = buffers["validity"] - null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size()) + null_pos = buffer_to_ndarray( + valid_buff, valid_dtype, offset=col.offset, length=col.size() + ) if sentinel_val == 0: null_pos = ~null_pos @@ -356,8 +364,8 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: getattr(ArrowCTypes, f"UINT{dtype[1]}"), Endianness.NATIVE, ), - col.offset, - col.size(), + offset=col.offset, + length=col.size(), ) data = parse_datetime_format_str(format_str, data) @@ -368,8 +376,9 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: def buffer_to_ndarray( buffer: Buffer, dtype: tuple[DtypeKind, int, str, str], + *, + length: int, offset: int = 0, - length: int | None = None, ) -> np.ndarray: """ Build a NumPy array from the passed buffer. @@ -406,74 +415,27 @@ def buffer_to_ndarray( # and size in the buffer plus the dtype on the column. 
Use DLPack as NumPy supports # it since https://github.com/numpy/numpy/pull/19083 ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype) - data_pointer = ctypes.cast( - buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type) - ) if bit_width == 1: assert length is not None, "`length` must be specified for a bit-mask buffer." - arr = np.ctypeslib.as_array(data_pointer, shape=(buffer.bufsize,)) - return bitmask_to_bool_ndarray(arr, length, first_byte_offset=offset % 8) + pa = import_optional_dependency("pyarrow") + arr = pa.BooleanArray.from_buffers( + pa.bool_(), + length, + [None, pa.foreign_buffer(buffer.ptr, length)], + offset=offset, + ) + return np.asarray(arr) else: + data_pointer = ctypes.cast( + buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type) + ) return np.ctypeslib.as_array( - data_pointer, shape=(buffer.bufsize // (bit_width // 8),) + data_pointer, + shape=(length,), ) -def bitmask_to_bool_ndarray( - bitmask: np.ndarray, mask_length: int, first_byte_offset: int = 0 -) -> np.ndarray: - """ - Convert bit-mask to a boolean NumPy array. - - Parameters - ---------- - bitmask : np.ndarray[uint8] - NumPy array of uint8 dtype representing the bitmask. - mask_length : int - Number of elements in the mask to interpret. - first_byte_offset : int, default: 0 - Number of elements to offset from the start of the first byte. 
- - Returns - ------- - np.ndarray[bool] - """ - bytes_to_skip = first_byte_offset // 8 - bitmask = bitmask[bytes_to_skip:] - first_byte_offset %= 8 - - bool_mask = np.zeros(mask_length, dtype=bool) - - # Processing the first byte separately as it has its own offset - val = bitmask[0] - mask_idx = 0 - bits_in_first_byte = min(8 - first_byte_offset, mask_length) - for j in range(bits_in_first_byte): - if val & (1 << (j + first_byte_offset)): - bool_mask[mask_idx] = True - mask_idx += 1 - - # `mask_length // 8` describes how many full bytes to process - for i in range((mask_length - bits_in_first_byte) // 8): - # doing `+ 1` as we already processed the first byte - val = bitmask[i + 1] - for j in range(8): - if val & (1 << j): - bool_mask[mask_idx] = True - mask_idx += 1 - - if len(bitmask) > 1: - # Processing reminder of last byte - val = bitmask[-1] - for j in range(len(bool_mask) - mask_idx): - if val & (1 << j): - bool_mask[mask_idx] = True - mask_idx += 1 - - return bool_mask - - def set_nulls( data: np.ndarray | pd.Series, col: Column, @@ -509,7 +471,9 @@ def set_nulls( elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): assert validity, "Expected to have a validity buffer for the mask" valid_buff, valid_dtype = validity - null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size()) + null_pos = buffer_to_ndarray( + valid_buff, valid_dtype, offset=col.offset, length=col.size() + ) if sentinel_val == 0: null_pos = ~null_pos elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN): diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index a9835b8641e7d..d393ba6fd3957 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -104,6 +104,32 @@ def test_large_string_pyarrow(): assert pa.Table.equals(pa.interchange.from_dataframe(result), table) +@pytest.mark.parametrize( + ("offset", "length", "expected_values"), + [ + (0, None, 
[3.3, float("nan"), 2.1]), + (1, None, [float("nan"), 2.1]), + (2, None, [2.1]), + (0, 2, [3.3, float("nan")]), + (0, 1, [3.3]), + (1, 1, [float("nan")]), + ], +) +def test_bitmasks_pyarrow(offset, length, expected_values): + # GH 52795 + pa = pytest.importorskip("pyarrow", "11.0.0") + + arr = [3.3, None, 2.1] + table = pa.table({"arr": arr}).slice(offset, length) + exchange_df = table.__dataframe__() + result = from_dataframe(exchange_df) + expected = pd.DataFrame({"arr": expected_values}) + tm.assert_frame_equal(result, expected) + + # check round-trip + assert pa.Table.equals(pa.interchange.from_dataframe(result), table) + + @pytest.mark.parametrize( "data", [int_data, uint_data, float_data, bool_data, datetime_data] ) From 59c24cb9fd32ba52eed46ad82560adfa92f99855 Mon Sep 17 00:00:00 2001 From: mKlepsch <37306042+mKlepsch@users.noreply.github.com> Date: Tue, 25 Apr 2023 02:52:10 +0200 Subject: [PATCH 046/577] Typing: Narrow down types of arguments (NDFrame) #10 (#52754) * improved specificationfor date_unit in to_json more precisely * improved specification mode in to_hdf * added the Literal to complib of to_hdf * improved format for to_hdf * address PR review * Update generic.py removed the new line in the import * ran "pre-commit" to fix trailing whitespace --- pandas/_typing.py | 1 + pandas/core/generic.py | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index e162f7f1662ee..dc3f2f54a54ca 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -419,6 +419,7 @@ def closed(self) -> bool: AlignJoin = Literal["outer", "inner", "left", "right"] DtypeBackend = Literal["pyarrow", "numpy_nullable"] +TimeUnit = Literal["s", "ms", "us", "ns"] OpenFileErrors = Literal[ "strict", "ignore", diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f3de296841510..9e9f28b1dfddb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -82,6 +82,7 @@ TimedeltaConvertibleTypes, 
TimeNonexistent, TimestampConvertibleTypes, + TimeUnit, ValueKeyFunc, WriteBuffer, WriteExcelBuffer, @@ -2284,7 +2285,7 @@ def to_json( date_format: str | None = None, double_precision: int = 10, force_ascii: bool_t = True, - date_unit: str = "ms", + date_unit: TimeUnit = "ms", default_handler: Callable[[Any], JSONSerializable] | None = None, lines: bool_t = False, compression: CompressionOptions = "infer", @@ -2564,11 +2565,11 @@ def to_hdf( self, path_or_buf: FilePath | HDFStore, key: str, - mode: str = "a", + mode: Literal["a", "w", "r+"] = "a", complevel: int | None = None, - complib: str | None = None, + complib: Literal["zlib", "lzo", "bzip2", "blosc"] | None = None, append: bool_t = False, - format: str | None = None, + format: Literal["fixed", "table"] | None = None, index: bool_t = True, min_itemsize: int | dict[str, int] | None = None, nan_rep=None, From 072d8bbd457f4cc0c637148b6f118483b0149994 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 25 Apr 2023 08:29:56 -0700 Subject: [PATCH 047/577] BUG: Fix pandas._libs.json __name__ (#52903) --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/_libs/src/ujson/python/ujson.c | 2 +- pandas/tests/io/json/test_ujson.py | 4 ++++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index b7d17d1abbe91..45a5efa574bf9 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -424,7 +424,7 @@ Other - Bug in :meth:`DataFrame.reindex` with a ``fill_value`` that should be inferred with a :class:`ExtensionDtype` incorrectly inferring ``object`` dtype (:issue:`52586`) - Bug in :meth:`Series.map` when giving a callable to an empty series, the returned series had ``object`` dtype. 
It now keeps the original dtype (:issue:`52384`) - Bug in :meth:`Series.memory_usage` when ``deep=True`` throw an error with Series of objects and the returned value is incorrect, as it does not take into account GC corrections (:issue:`51858`) -- +- Fixed incorrect ``__name__`` attribute of ``pandas._libs.json`` (:issue:`52898`) .. ***DO NOT USE THIS SECTION*** diff --git a/pandas/_libs/src/ujson/python/ujson.c b/pandas/_libs/src/ujson/python/ujson.c index c12f88d2f9354..5c87ee6dd7ddc 100644 --- a/pandas/_libs/src/ujson/python/ujson.c +++ b/pandas/_libs/src/ujson/python/ujson.c @@ -83,7 +83,7 @@ static int module_clear(PyObject *m); static void module_free(void *module); static struct PyModuleDef moduledef = {.m_base = PyModuleDef_HEAD_INIT, - .m_name = "_libjson", + .m_name = "pandas._libs.json", .m_methods = ujsonMethods, .m_size = sizeof(modulestate), .m_traverse = module_traverse, diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 6b635a4f46972..0df6b1eef72c0 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -694,6 +694,10 @@ def e(self): test_object = _TestObject(a=1, b=2, _c=3, d=4) assert ujson.decode(ujson.encode(test_object)) == {"a": 1, "b": 2, "d": 4} + def test_ujson__name__(self): + # GH 52898 + assert ujson.__name__ == "pandas._libs.json" + class TestNumpyJSONTests: @pytest.mark.parametrize("bool_input", [True, False]) From fd200e1fc08a2a3f76908df6fa5c353a9a7e902e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 25 Apr 2023 09:46:39 -0700 Subject: [PATCH 048/577] BUG: date_range with DateOffset with nanoseconds (#52902) * date_range with freq with nano * add whatsnew and test, simplify logic --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/_libs/tslibs/offsets.pyx | 17 ++++++++------- pandas/core/arrays/datetimes.py | 18 ++++++++++++++-- .../indexes/datetimes/test_date_range.py | 21 +++++++++++++++++++ 4 files 
changed, 48 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 45a5efa574bf9..46f564f716b0a 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -293,9 +293,9 @@ Categorical Datetimelike ^^^^^^^^^^^^ - :meth:`DatetimeIndex.map` with ``na_action="ignore"`` now works as expected. (:issue:`51644`) +- Bug in :func:`date_range` when ``freq`` was a :class:`DateOffset` with ``nanoseconds`` (:issue:`46877`) - Bug in :meth:`Timestamp.round` with values close to the implementation bounds returning incorrect results instead of raising ``OutOfBoundsDatetime`` (:issue:`51494`) - Bug in :meth:`arrays.DatetimeArray.map` and :meth:`DatetimeIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`) -- Timedelta ^^^^^^^^^ diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 8bbaafa536457..0614cb2a9d8c9 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1217,7 +1217,10 @@ cdef class RelativeDeltaOffset(BaseOffset): @apply_wraps def _apply(self, other: datetime) -> datetime: + other_nanos = 0 if self._use_relativedelta: + if isinstance(other, _Timestamp): + other_nanos = other.nanosecond other = _as_datetime(other) if len(self.kwds) > 0: @@ -1226,17 +1229,17 @@ cdef class RelativeDeltaOffset(BaseOffset): # perform calculation in UTC other = other.replace(tzinfo=None) - if hasattr(self, "nanoseconds"): - td_nano = Timedelta(nanoseconds=self.nanoseconds) - else: - td_nano = Timedelta(0) - if self.n > 0: for i in range(self.n): - other = other + self._offset + td_nano + other = other + self._offset else: for i in range(-self.n): - other = other - self._offset - td_nano + other = other - self._offset + + if hasattr(self, "nanoseconds"): + other = self.n * Timedelta(nanoseconds=self.nanoseconds) + other + if other_nanos != 0: + other = Timedelta(nanoseconds=other_nanos) + other if tzinfo 
is not None and self._use_relativedelta: # bring tz back from UTC calculation diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index a765f4ae1b21b..b14a54a872a69 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2581,7 +2581,14 @@ def _generate_range( break # faster than cur + offset - next_date = offset._apply(cur).as_unit(unit) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Discarding nonzero nanoseconds in conversion", + category=UserWarning, + ) + next_date = offset._apply(cur) + next_date = next_date.as_unit(unit) if next_date <= cur: raise ValueError(f"Offset {offset} did not increment date") cur = next_date @@ -2595,7 +2602,14 @@ def _generate_range( break # faster than cur + offset - next_date = offset._apply(cur).as_unit(unit) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Discarding nonzero nanoseconds in conversion", + category=UserWarning, + ) + next_date = offset._apply(cur) + next_date = next_date.as_unit(unit) if next_date >= cur: raise ValueError(f"Offset {offset} did not decrement date") cur = next_date diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index ef909feccfcd3..f3f5d0dfd5ec1 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -790,6 +790,27 @@ def test_range_where_start_equal_end(self, inclusive_endpoints_fixture): tm.assert_index_equal(result, expected) + def test_freq_dateoffset_with_relateivedelta_nanos(self): + # GH 46877 + freq = DateOffset(hours=10, days=57, nanoseconds=3) + result = date_range(end="1970-01-01 00:00:00", periods=10, freq=freq, name="a") + expected = DatetimeIndex( + [ + "1968-08-02T05:59:59.999999973", + "1968-09-28T15:59:59.999999976", + "1968-11-25T01:59:59.999999979", + "1969-01-21T11:59:59.999999982", + "1969-03-19T21:59:59.999999985", + 
"1969-05-16T07:59:59.999999988", + "1969-07-12T17:59:59.999999991", + "1969-09-08T03:59:59.999999994", + "1969-11-04T13:59:59.999999997", + "1970-01-01T00:00:00.000000000", + ], + name="a", + ) + tm.assert_index_equal(result, expected) + class TestDateRangeTZ: """Tests for date_range with timezones""" From 3efee20817e307565f645e8bd9af07c5593b86ff Mon Sep 17 00:00:00 2001 From: Jinli Xiao <41782470+jinlixiao@users.noreply.github.com> Date: Tue, 25 Apr 2023 12:59:13 -0400 Subject: [PATCH 049/577] Call __finalize__ in Dataframe.combine and Dataframe.combine_first (#52886) * Fixed metadata propagation in Dataframe.combine and Dataframe.combine_first * fix local var override --- pandas/core/frame.py | 5 +++-- pandas/tests/generic/test_finalize.py | 22 ++++++++-------------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 051ebfff47f83..34e8c969d1b75 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8285,7 +8285,8 @@ def combine( result[col] = arr # convert_objects just in case - return self._constructor(result, index=new_index, columns=new_columns) + frame_result = self._constructor(result, index=new_index, columns=new_columns) + return frame_result.__finalize__(self, method="combine") def combine_first(self, other: DataFrame) -> DataFrame: """ @@ -8360,7 +8361,7 @@ def combiner(x, y): if dtypes: combined = combined.astype(dtypes) - return combined + return combined.__finalize__(self, method="combine_first") def update( self, diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 3c4ea5bd1fb2c..a76b6b94d719d 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -115,21 +115,15 @@ operator.methodcaller("add", pd.DataFrame(*frame_data)), ), # TODO: div, mul, etc. 
- pytest.param( - ( - pd.DataFrame, - frame_data, - operator.methodcaller("combine", pd.DataFrame(*frame_data), operator.add), - ), - marks=not_implemented_mark, + ( + pd.DataFrame, + frame_data, + operator.methodcaller("combine", pd.DataFrame(*frame_data), operator.add), ), - pytest.param( - ( - pd.DataFrame, - frame_data, - operator.methodcaller("combine_first", pd.DataFrame(*frame_data)), - ), - marks=not_implemented_mark, + ( + pd.DataFrame, + frame_data, + operator.methodcaller("combine_first", pd.DataFrame(*frame_data)), ), pytest.param( ( From 4c5b1740dd1eac0deb8fd80878705dc49d47c5d0 Mon Sep 17 00:00:00 2001 From: Matt Richards <45483497+m-richards@users.noreply.github.com> Date: Wed, 26 Apr 2023 03:27:05 +1000 Subject: [PATCH 050/577] BUG: Fix getitem dtype preservation with multiindexes (#51895) * BUG/TST fix dtype preservation with multindex * lint * Update pandas/tests/indexing/multiindex/test_multiindex.py Co-authored-by: Joris Van den Bossche * cleanups * switch to iloc, reindex fails in some cases * suggestions from code review * address code review comments Co-Authored-By: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --------- Co-authored-by: Joris Van den Bossche Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/core/frame.py | 14 ++----------- .../indexing/multiindex/test_multiindex.py | 20 +++++++++++++++++++ 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 46f564f716b0a..cfce12c2930d7 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -350,8 +350,8 @@ Missing MultiIndex ^^^^^^^^^^ +- Bug in :meth:`DataFrame.__getitem__` not preserving dtypes for :class:`MultiIndex` partial keys (:issue:`51895`) - Bug in :meth:`MultiIndex.set_levels` not preserving dtypes for :class:`Categorical` (:issue:`52125`) -- I/O ^^^ diff --git 
a/pandas/core/frame.py b/pandas/core/frame.py index 34e8c969d1b75..e04555ca30008 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3831,18 +3831,8 @@ def _getitem_multilevel(self, key): if isinstance(loc, (slice, np.ndarray)): new_columns = self.columns[loc] result_columns = maybe_droplevels(new_columns, key) - if self._is_mixed_type: - result = self.reindex(columns=new_columns) - result.columns = result_columns - else: - new_values = self._values[:, loc] - result = self._constructor( - new_values, index=self.index, columns=result_columns, copy=False - ) - if using_copy_on_write() and isinstance(loc, slice): - result._mgr.add_references(self._mgr) # type: ignore[arg-type] - - result = result.__finalize__(self) + result = self.iloc[:, loc] + result.columns = result_columns # If there is only one column being returned, and its name is # either an empty string, or a tuple with an empty string as its diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index d7285f0069c71..955c4acfd4c97 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -6,12 +6,14 @@ import pandas as pd from pandas import ( + CategoricalDtype, DataFrame, Index, MultiIndex, Series, ) import pandas._testing as tm +from pandas.core.arrays.boolean import BooleanDtype class TestMultiIndexBasic: @@ -207,6 +209,24 @@ def test_multiindex_with_na_missing_key(self): with pytest.raises(KeyError, match="missing_key"): df[[("missing_key",)]] + def test_multiindex_dtype_preservation(self): + # GH51261 + columns = MultiIndex.from_tuples([("A", "B")], names=["lvl1", "lvl2"]) + df = DataFrame(["value"], columns=columns).astype("category") + df_no_multiindex = df["A"] + assert isinstance(df_no_multiindex["B"].dtype, CategoricalDtype) + + # geopandas 1763 analogue + df = DataFrame( + [[1, 0], [0, 1]], + columns=[ + ["foo", "foo"], + ["location", "location"], + ["x", 
"y"], + ], + ).assign(bools=Series([True, False], dtype="boolean")) + assert isinstance(df["bools"].dtype, BooleanDtype) + def test_multiindex_from_tuples_with_nan(self): # GH#23578 result = MultiIndex.from_tuples([("a", "b", "c"), np.nan, ("d", "", "")]) From 8ca7f9fd0e61696f9990a8e11f2ebc7bf43229cc Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 25 Apr 2023 18:28:47 +0100 Subject: [PATCH 051/577] REF: refactor ArrowExtensionArray._reduce (#52890) * REF: refactor ArrowExtensionArray._reduce * fix * add return type to _reduce_pyarrow --- pandas/core/arrays/arrow/array.py | 39 +++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 51d6fa74ea94e..55c6b74e495c0 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1223,9 +1223,9 @@ def _accumulate( return type(self)(result) - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + def _reduce_pyarrow(self, name: str, *, skipna: bool = True, **kwargs) -> pa.Scalar: """ - Return a scalar result of performing the reduction operation. + Return a pyarrow scalar result of performing the reduction operation. Parameters ---------- @@ -1241,7 +1241,7 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): Returns ------- - scalar + pyarrow scalar Raises ------ @@ -1321,7 +1321,7 @@ def pyarrow_meth(data, skip_nulls, **kwargs): # GH 52679: Use quantile instead of approximate_median; returns array result = result[0] if pc.is_null(result).as_py(): - return self.dtype.na_value + return result if name in ["min", "max", "sum"] and pa.types.is_duration(pa_type): result = result.cast(pa_type) @@ -1341,6 +1341,37 @@ def pyarrow_meth(data, skip_nulls, **kwargs): # i.e. 
timestamp result = result.cast(pa.duration(pa_type.unit)) + return result + + def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + """ + Return a scalar result of performing the reduction operation. + + Parameters + ---------- + name : str + Name of the function, supported values are: + { any, all, min, max, sum, mean, median, prod, + std, var, sem, kurt, skew }. + skipna : bool, default True + If True, skip NaN values. + **kwargs + Additional keyword arguments passed to the reduction function. + Currently, `ddof` is the only supported kwarg. + + Returns + ------- + scalar + + Raises + ------ + TypeError : subclass does not define reductions + """ + result = self._reduce_pyarrow(name, skipna=skipna, **kwargs) + + if pc.is_null(result).as_py(): + return self.dtype.na_value + return result.as_py() def __setitem__(self, key, value) -> None: From 56778824a2d6870e1118886667c3c74bdbd7cb62 Mon Sep 17 00:00:00 2001 From: Nirav <61644078+srkds@users.noreply.github.com> Date: Wed, 26 Apr 2023 00:19:01 +0530 Subject: [PATCH 052/577] TST: apply returns list (#52912) --- pandas/tests/apply/test_frame_apply.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 0397f8cae3ac7..5a2574e62b41e 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -1433,6 +1433,13 @@ def test_apply_on_empty_dataframe(): tm.assert_series_equal(result, expected) +def test_apply_return_list(): + df = DataFrame({"a": [1, 2], "b": [2, 3]}) + result = df.apply(lambda x: [x.values]) + expected = DataFrame({"a": [[1, 2]], "b": [[2, 3]]}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "test, constant", [ From 0ee8d138a7cb8db4ab9d0ae9ad7a9d91b518d78f Mon Sep 17 00:00:00 2001 From: shteken Date: Tue, 25 Apr 2023 20:51:30 +0200 Subject: [PATCH 053/577] TST: Add test_datetime_object_multiindex test (#52913) add 
test_datetime_object_multiindex test --- pandas/tests/test_multilevel.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index f8e1128042dbb..199307dd138ca 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1,3 +1,5 @@ +import datetime + import numpy as np import pytest @@ -266,6 +268,28 @@ def test_subsets_multiindex_dtype(self): result = df.a.b.dtypes tm.assert_series_equal(result, expected) + def test_datetime_object_multiindex(self): + data_dic = { + (0, datetime.date(2018, 3, 3)): {"A": 1, "B": 10}, + (0, datetime.date(2018, 3, 4)): {"A": 2, "B": 11}, + (1, datetime.date(2018, 3, 3)): {"A": 3, "B": 12}, + (1, datetime.date(2018, 3, 4)): {"A": 4, "B": 13}, + } + result = DataFrame.from_dict(data_dic, orient="index") + data = {"A": [1, 2, 3, 4], "B": [10, 11, 12, 13]} + index = [ + [0, 0, 1, 1], + [ + datetime.date(2018, 3, 3), + datetime.date(2018, 3, 4), + datetime.date(2018, 3, 3), + datetime.date(2018, 3, 4), + ], + ] + expected = DataFrame(data=data, index=index) + + tm.assert_frame_equal(result, expected) + class TestSorted: """everything you wanted to test about sorting""" From aeb1eee4eb1fa9e64fbce2112a91fbf0b4c80121 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 25 Apr 2023 13:10:28 -0700 Subject: [PATCH 054/577] CI: Bump GHA versions (#52916) --- .github/actions/run-tests/action.yml | 4 ++-- .github/actions/setup-conda/action.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml index 2a7601f196ec4..fd7c3587f2254 100644 --- a/.github/actions/run-tests/action.yml +++ b/.github/actions/run-tests/action.yml @@ -7,7 +7,7 @@ runs: shell: bash -el {0} - name: Publish test results - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: Test results path: 
test-data.xml @@ -19,7 +19,7 @@ runs: if: failure() - name: Upload coverage to Codecov - uses: codecov/codecov-action@v2 + uses: codecov/codecov-action@v3 with: flags: unittests name: codecov-pandas diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml index 329dc24d466b4..700197e400c5f 100644 --- a/.github/actions/setup-conda/action.yml +++ b/.github/actions/setup-conda/action.yml @@ -13,7 +13,7 @@ runs: using: composite steps: - name: Install ${{ inputs.environment-file }} - uses: mamba-org/provision-with-micromamba@v12 + uses: mamba-org/provision-with-micromamba@v15 with: environment-file: ${{ inputs.environment-file }} environment-name: ${{ inputs.environment-name }} From 88162739820360d6a5c8f1af03c3468b9e3c173b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 25 Apr 2023 13:11:15 -0700 Subject: [PATCH 055/577] TYP: Use Protocol for ReaderBase (#52915) --- pandas/io/sas/sasreader.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index d56f4c7ebc695..2a395f790a5b5 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -3,13 +3,10 @@ """ from __future__ import annotations -from abc import ( - ABCMeta, - abstractmethod, -) from typing import ( TYPE_CHECKING, Hashable, + Protocol, overload, ) @@ -31,19 +28,16 @@ from pandas import DataFrame -# TODO(PY38): replace with Protocol in Python 3.8 -class ReaderBase(metaclass=ABCMeta): +class ReaderBase(Protocol): """ Protocol for XportReader and SAS7BDATReader classes. """ - @abstractmethod def read(self, nrows: int | None = None) -> DataFrame: - pass + ... - @abstractmethod def close(self) -> None: - pass + ... 
def __enter__(self) -> ReaderBase: return self From 381fe9d4f5a60c496c9721c843ed7b9f0c02a705 Mon Sep 17 00:00:00 2001 From: gmollard Date: Tue, 25 Apr 2023 22:12:06 +0200 Subject: [PATCH 056/577] TYP: Added IntervalClosedType where needed (#52894) --- pandas/_libs/interval.pyi | 2 +- pandas/_libs/interval.pyx | 2 +- pandas/core/arrays/interval.py | 2 +- pandas/core/dtypes/dtypes.py | 7 ++++--- pandas/core/indexes/interval.py | 2 +- pandas/io/formats/style.py | 5 +++-- 6 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/interval.pyi b/pandas/_libs/interval.pyi index 4c36246e04d23..587fdf84f2f85 100644 --- a/pandas/_libs/interval.pyi +++ b/pandas/_libs/interval.pyi @@ -148,7 +148,7 @@ class Interval(IntervalMixin, Generic[_OrderableT]): def intervals_to_interval_bounds( intervals: np.ndarray, validate_closed: bool = ... -) -> tuple[np.ndarray, np.ndarray, str]: ... +) -> tuple[np.ndarray, np.ndarray, IntervalClosedType]: ... class IntervalTree(IntervalMixin): def __init__( diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 14b7baf7f5a08..fe405b98f218c 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -612,7 +612,7 @@ def intervals_to_interval_bounds(ndarray intervals, bint validate_closed=True): tuple of left : ndarray right : ndarray - closed: str + closed: IntervalClosedType """ cdef: object closed = None, interval diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index b8442205c331e..2ba66006a0645 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -226,7 +226,7 @@ def ndim(self) -> Literal[1]: def __new__( cls, data, - closed=None, + closed: IntervalClosedType | None = None, dtype: Dtype | None = None, copy: bool = False, verify_integrity: bool = True, diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 479bfdc557a07..014d7a4c5a330 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ 
-53,6 +53,7 @@ from pandas._typing import ( Dtype, DtypeObj, + IntervalClosedType, Ordered, npt, type_t, @@ -1099,7 +1100,7 @@ class IntervalDtype(PandasExtensionDtype): _cache_dtypes: dict[str_type, PandasExtensionDtype] = {} - def __new__(cls, subtype=None, closed: str_type | None = None): + def __new__(cls, subtype=None, closed: IntervalClosedType | None = None): from pandas.core.dtypes.common import ( is_string_dtype, pandas_dtype, @@ -1137,7 +1138,7 @@ def __new__(cls, subtype=None, closed: str_type | None = None): "'closed' keyword does not match value " "specified in dtype string" ) - closed = gd["closed"] + closed = gd["closed"] # type: ignore[assignment] try: subtype = pandas_dtype(subtype) @@ -1175,7 +1176,7 @@ def _can_hold_na(self) -> bool: return True @property - def closed(self): + def closed(self) -> IntervalClosedType: return self._closed @property diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 13c87a9d06b66..a667506135643 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -220,7 +220,7 @@ class IntervalIndex(ExtensionIndex): def __new__( cls, data, - closed=None, + closed: IntervalClosedType | None = None, dtype: Dtype | None = None, copy: bool = False, name: Hashable = None, diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index e2c5ed2ea92b6..8ddf14175a6da 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -65,6 +65,7 @@ AxisInt, FilePath, IndexLabel, + IntervalClosedType, Level, QuantileInterpolation, Scalar, @@ -3185,7 +3186,7 @@ def highlight_between( axis: Axis | None = 0, left: Scalar | Sequence | None = None, right: Scalar | Sequence | None = None, - inclusive: str = "both", + inclusive: IntervalClosedType = "both", props: str | None = None, ) -> Styler: """ @@ -3294,7 +3295,7 @@ def highlight_quantile( q_left: float = 0.0, q_right: float = 1.0, interpolation: QuantileInterpolation = "linear", - inclusive: str = "both", + 
inclusive: IntervalClosedType = "both", props: str | None = None, ) -> Styler: """ From a80d657641c86b450a30d4b75c3da1a8daf38505 Mon Sep 17 00:00:00 2001 From: VomV Date: Tue, 25 Apr 2023 21:14:05 +0100 Subject: [PATCH 057/577] Modified the doc string of value_counts for DataFrame and series when sort arg is set to false (#52857) --- pandas/core/base.py | 2 +- pandas/core/frame.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index f907089f77132..5bc0fc3e1af63 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -825,7 +825,7 @@ def value_counts( If True then the object returned will contain the relative frequencies of the unique values. sort : bool, default True - Sort by frequencies. + Sort by frequencies when True. Preserve the order of the data when False. ascending : bool, default False Sort in ascending order. bins : int, optional diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e04555ca30008..838a34adeaf82 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6964,7 +6964,7 @@ def value_counts( normalize : bool, default False Return proportions rather than frequencies. sort : bool, default True - Sort by frequencies. + Sort by frequencies when True. Sort by DataFrame column values when False. ascending : bool, default False Sort in ascending order. 
dropna : bool, default True From 36b302118b2b495306a6a64341d206715d10a360 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 26 Apr 2023 16:30:37 +0200 Subject: [PATCH 058/577] CI: Change development python version to 3.10 (#51133) * CI: Change development python version to 3.10 * Update checks * Remove strict * Remove strict * Fixes * Add dt * Switch python to 3.9 * Remove * Fix * Try attribute * Adjust * Fix mypy * Try fixing doc build * Fix mypy * Fix stubtest * Remove workflow file * Rename back * Update * Rename * Rename * Change python version * Remove * Fix doc errors * Remove pypy * Update ci/deps/actions-pypy-39.yaml Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> * Revert pypy removal * Remove again * Fix * Change to 3.9 * Address --------- Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .circleci/config.yml | 2 +- .github/workflows/code-checks.yml | 2 +- .github/workflows/package-checks.yml | 2 +- .github/workflows/sdist.yml | 4 +- .github/workflows/unit-tests.yml | 16 ++--- .github/workflows/wheels.yml | 4 +- ci/deps/actions-38.yaml | 60 ------------------- ...yaml => actions-39-downstream_compat.yaml} | 2 +- ....yaml => actions-39-minimum_versions.yaml} | 2 +- ...ions-pypy-38.yaml => actions-pypy-39.yaml} | 2 +- ...cle-38-arm64.yaml => circle-39-arm64.yaml} | 2 +- doc/source/conf.py | 1 + .../development/contributing_gitpod.rst | 2 +- doc/source/getting_started/install.rst | 2 +- doc/source/whatsnew/v2.1.0.rst | 5 ++ environment.yml | 4 +- pandas/_libs/tslibs/timestamps.pyi | 4 +- pandas/conftest.py | 6 +- pandas/core/generic.py | 3 +- pandas/plotting/_core.py | 4 +- pandas/tests/arrays/test_datetimes.py | 9 ++- .../indexes/datetimes/test_constructors.py | 2 + .../tests/indexes/datetimes/test_timezones.py | 3 +- .../scalar/timestamp/test_constructors.py | 5 +- .../tests/scalar/timestamp/test_timezones.py | 3 +- pyproject.toml | 13 ++-- 
requirements-dev.txt | 2 +- scripts/run_stubtest.py | 4 +- scripts/validate_min_versions_in_sync.py | 5 ++ 29 files changed, 71 insertions(+), 104 deletions(-) delete mode 100644 ci/deps/actions-38.yaml rename ci/deps/{actions-38-downstream_compat.yaml => actions-39-downstream_compat.yaml} (98%) rename ci/deps/{actions-38-minimum_versions.yaml => actions-39-minimum_versions.yaml} (98%) rename ci/deps/{actions-pypy-38.yaml => actions-pypy-39.yaml} (86%) rename ci/deps/{circle-38-arm64.yaml => circle-39-arm64.yaml} (98%) diff --git a/.circleci/config.yml b/.circleci/config.yml index e704c37df3e45..e7322e748662f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,7 +6,7 @@ jobs: image: ubuntu-2004:2022.04.1 resource_class: arm.large environment: - ENV_FILE: ci/deps/circle-38-arm64.yaml + ENV_FILE: ci/deps/circle-39-arm64.yaml PYTEST_WORKERS: auto PATTERN: "not single_cpu and not slow and not network and not clipboard and not arm_slow and not db" PYTEST_TARGET: "pandas" diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index d6d43a8bfc13b..ab8f873e9b70b 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -179,7 +179,7 @@ jobs: id: setup_python uses: actions/setup-python@v4 with: - python-version: '3.8' + python-version: '3.10' cache: 'pip' cache-dependency-path: 'requirements-dev.txt' diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index fa1b5e5d4fba3..7130bed21d5ff 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -38,7 +38,7 @@ jobs: id: setup_python uses: actions/setup-python@v4 with: - python-version: '3.8' + python-version: '3.10' - name: Install required dependencies run: | diff --git a/.github/workflows/sdist.yml b/.github/workflows/sdist.yml index 460369f45e900..957e7103f4ff6 100644 --- a/.github/workflows/sdist.yml +++ b/.github/workflows/sdist.yml @@ -29,7 +29,7 @@ jobs: strategy: 
fail-fast: false matrix: - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11"] concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{matrix.python-version}}-sdist @@ -80,8 +80,6 @@ jobs: - name: Force oldest supported NumPy run: | case "${{matrix.python-version}}" in - 3.8) - pip install numpy==1.21.6 ;; 3.9) pip install numpy==1.21.6 ;; 3.10) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 31e2095624347..69baa3ccd6f34 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -26,19 +26,19 @@ jobs: timeout-minutes: 180 strategy: matrix: - env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml] + env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml] # Prevent the include jobs from overriding other jobs pattern: [""] include: - name: "Downstream Compat" - env_file: actions-38-downstream_compat.yaml + env_file: actions-39-downstream_compat.yaml pattern: "not slow and not network and not single_cpu" pytest_target: "pandas/tests/test_downstream.py" - name: "Minimum Versions" - env_file: actions-38-minimum_versions.yaml + env_file: actions-39-minimum_versions.yaml pattern: "not slow and not network and not single_cpu" - name: "Locale: it_IT" - env_file: actions-38.yaml + env_file: actions-310.yaml pattern: "not slow and not network and not single_cpu" extra_apt: "language-pack-it" # Use the utf8 version as the default, it has no bad side-effect. @@ -48,7 +48,7 @@ jobs: # It will be temporarily activated during tests with locale.setlocale extra_loc: "it_IT" - name: "Locale: zh_CN" - env_file: actions-38.yaml + env_file: actions-310.yaml pattern: "not slow and not network and not single_cpu" extra_apt: "language-pack-zh-hans" # Use the utf8 version as the default, it has no bad side-effect. 
@@ -62,7 +62,7 @@ jobs: pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "1" - name: "Pypy" - env_file: actions-pypy-38.yaml + env_file: actions-pypy-39.yaml pattern: "not slow and not network and not single_cpu" test_args: "--max-worker-restart 0" - name: "Numpy Dev" @@ -173,7 +173,7 @@ jobs: strategy: matrix: os: [macos-latest, windows-latest] - env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml] + env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml] fail-fast: false runs-on: ${{ matrix.os }} name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }} @@ -227,7 +227,7 @@ jobs: fi - name: Build environment and Run Tests run: | - /opt/python/cp38-cp38/bin/python -m venv ~/virtualenvs/pandas-dev + /opt/python/cp39-cp39/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir --no-deps -U pip wheel setuptools python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index a42957c1cc942..79dd9222fb90d 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -55,7 +55,7 @@ jobs: - [windows-2019, win_amd64] - [windows-2019, win32] # TODO: support PyPy? 
- python: [["cp38", "3.8"], ["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"]]# "pp38", "pp39"] + python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"]]# "pp39"] env: IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} @@ -152,7 +152,7 @@ jobs: auto-update-conda: true # Really doesn't matter what version we upload with # just the version we test with - python-version: '3.8' + python-version: '3.10' channels: conda-forge channel-priority: true # mamba fails to solve, also we really don't need this since we're just installing python diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml deleted file mode 100644 index df7d0277f3ba9..0000000000000 --- a/ci/deps/actions-38.yaml +++ /dev/null @@ -1,60 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.8 - - # build dependencies - - versioneer[toml] - - cython>=0.29.33 - - # test dependencies - - pytest>=7.0.0 - - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - - boto3 - - # required dependencies - - python-dateutil - - numpy - - pytz - - # optional dependencies - - beautifulsoup4>=4.11.1 - - blosc>=1.21.0 - - bottleneck>=1.3.4 - - brotlipy>=0.7.0 - - fastparquet>=0.8.1 - - fsspec>=2022.05.0 - - html5lib>=1.1 - - hypothesis>=6.46.1 - - gcsfs>=2022.05.0 - - jinja2>=3.1.2 - - lxml>=4.8.0 - - matplotlib>=3.6.1 - - numba>=0.55.2 - - numexpr>=2.8.0 - - odfpy>=1.4.1 - - qtpy>=2.2.0 - - openpyxl<3.1.1, >=3.0.10 - - pandas-gbq>=0.17.5 - - psycopg2>=2.9.3 - - pyarrow>=7.0.0 - - pymysql>=1.0.2 - - pyreadstat>=1.1.5 - - pytables>=3.7.0 - - python-snappy>=0.6.1 - - pyxlsb>=1.0.9 - - s3fs>=2022.05.0 - - scipy>=1.8.1 - - sqlalchemy>=1.4.36 - - tabulate>=0.8.10 - - xarray>=2022.03.0 - - xlrd>=2.0.1 - - xlsxwriter>=3.0.3 - - zstandard>=0.17.0 - - - pip: - - pyqt5>=5.15.6 - - tzdata>=2022.1 diff --git 
a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-39-downstream_compat.yaml similarity index 98% rename from ci/deps/actions-38-downstream_compat.yaml rename to ci/deps/actions-39-downstream_compat.yaml index 670d7c37dc4d2..241adef3367a0 100644 --- a/ci/deps/actions-38-downstream_compat.yaml +++ b/ci/deps/actions-39-downstream_compat.yaml @@ -3,7 +3,7 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.8 + - python=3.9 # build dependencies - versioneer[toml] diff --git a/ci/deps/actions-38-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml similarity index 98% rename from ci/deps/actions-38-minimum_versions.yaml rename to ci/deps/actions-39-minimum_versions.yaml index 96c6a0fd6eb2e..61752cbfa54b2 100644 --- a/ci/deps/actions-38-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -4,7 +4,7 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.8.0 + - python=3.9 # build dependencies - versioneer[toml] diff --git a/ci/deps/actions-pypy-38.yaml b/ci/deps/actions-pypy-39.yaml similarity index 86% rename from ci/deps/actions-pypy-38.yaml rename to ci/deps/actions-pypy-39.yaml index 981399dcd4b7c..64774e776056f 100644 --- a/ci/deps/actions-pypy-38.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -5,7 +5,7 @@ dependencies: # TODO: Add the rest of the dependencies in here # once the other plentiful failures/segfaults # with base pandas has been dealt with - - python=3.8[build=*_pypy] # TODO: use this once pypy3.8 is available + - python=3.9[build=*_pypy] # build dependencies - versioneer[toml] diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-39-arm64.yaml similarity index 98% rename from ci/deps/circle-38-arm64.yaml rename to ci/deps/circle-39-arm64.yaml index 5102b2ca55404..42f9994b64157 100644 --- a/ci/deps/circle-38-arm64.yaml +++ b/ci/deps/circle-39-arm64.yaml @@ -2,7 +2,7 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.8 + - python=3.9 # build dependencies - 
versioneer[toml] diff --git a/doc/source/conf.py b/doc/source/conf.py index c73a91aa90365..d808d60b1ac95 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -76,6 +76,7 @@ # to ensure that include files (partial pages) aren't built, exclude them # https://github.com/sphinx-doc/sphinx/issues/1965#issuecomment-124732907 "**/includes/**", + "**/api/pandas.Series.dt.rst", ] try: import nbconvert diff --git a/doc/source/development/contributing_gitpod.rst b/doc/source/development/contributing_gitpod.rst index c591be5425db9..042a2f316cd42 100644 --- a/doc/source/development/contributing_gitpod.rst +++ b/doc/source/development/contributing_gitpod.rst @@ -29,7 +29,7 @@ you do not have an account yet, you will need to create one first. To get started just login at `Gitpod`_, and grant the appropriate permissions to GitHub. -We have built a python 3.8 environment and all development dependencies will +We have built a python 3.10 environment and all development dependencies will install when the environment starts. diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 49ec242f6bb96..1770d759dde4d 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -20,7 +20,7 @@ Instructions for installing from source, Python version support ---------------------- -Officially Python 3.8, 3.9, 3.10 and 3.11. +Officially Python 3.9, 3.10 and 3.11. Installing pandas ----------------- diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index cfce12c2930d7..b10dd876050ae 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -124,6 +124,11 @@ Backwards incompatible API changes .. _whatsnew_210.api_breaking.deps: +Increased minimum version for Python +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +pandas 2.1.0 supports Python 3.9 and higher. 
+ Increased minimum versions for dependencies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Some minimum supported versions of dependencies were updated. diff --git a/environment.yml b/environment.yml index bde8c46bffd97..1e30c51537fa0 100644 --- a/environment.yml +++ b/environment.yml @@ -3,7 +3,7 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.8 + - python=3.10 - pip # build dependencies @@ -38,7 +38,7 @@ dependencies: - matplotlib>=3.6.1 - numba>=0.55.2 - numexpr>=2.8.0 # pin for "Run checks on imported code" job - - openpyxl<3.1.1, >=3.0.7 + - openpyxl>=3.0.10 - odfpy>=1.4.1 - py - psycopg2>=2.9.3 diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi index 547422f5ec55c..9ba75c8485ac7 100644 --- a/pandas/_libs/tslibs/timestamps.pyi +++ b/pandas/_libs/tslibs/timestamps.pyi @@ -158,7 +158,9 @@ class Timestamp(datetime): def __hash__(self) -> int: ... def weekday(self) -> int: ... def isoweekday(self) -> int: ... - def isocalendar(self) -> tuple[int, int, int]: ... + # Return type "Tuple[int, int, int]" of "isocalendar" incompatible with return + # type "_IsoCalendarDate" in supertype "date" + def isocalendar(self) -> tuple[int, int, int]: ... # type: ignore[override] @property def is_leap_year(self) -> bool: ... @property diff --git a/pandas/conftest.py b/pandas/conftest.py index 77d2f4802c08f..9ede9e65a6839 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -84,7 +84,7 @@ zoneinfo = None if compat.PY39: # Import "zoneinfo" could not be resolved (reportMissingImports) - import zoneinfo # type: ignore[no-redef] + import zoneinfo # type: ignore[assignment] # Although zoneinfo can be imported in Py39, it is effectively # "not available" without tzdata/IANA tz data. 
@@ -1964,7 +1964,9 @@ def using_copy_on_write() -> bool: warsaws = ["Europe/Warsaw", "dateutil/Europe/Warsaw"] if zoneinfo is not None: - warsaws.append(zoneinfo.ZoneInfo("Europe/Warsaw")) + warsaws.append( + zoneinfo.ZoneInfo("Europe/Warsaw") # pyright: ignore[reportGeneralTypeIssues] + ) @pytest.fixture(params=warsaws) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9e9f28b1dfddb..b6ade0728e075 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11756,8 +11756,7 @@ def __iand__(self, other) -> Self: @final def __ior__(self, other) -> Self: - # error: Unsupported left operand type for | ("Type[NDFrame]") - return self._inplace_method(other, type(self).__or__) # type: ignore[operator] + return self._inplace_method(other, type(self).__or__) @final def __ixor__(self, other) -> Self: diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 6ef33c3d58306..8dd3e4385a383 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1831,7 +1831,9 @@ def _load_backend(backend: str) -> types.ModuleType: if hasattr(eps, "select"): entry = eps.select(group=key) # pyright: ignore[reportGeneralTypeIssues] else: - entry = eps.get(key, ()) + # Argument 2 to "get" of "dict" has incompatible type "Tuple[]"; + # expected "EntryPoints" [arg-type] + entry = eps.get(key, ()) # type: ignore[arg-type] for entry_point in entry: found_backend = entry_point.name == backend if found_backend: diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 4bd6fd745d56d..b8cd8a5546f05 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -1,13 +1,16 @@ """ Tests for DatetimeArray """ +from __future__ import annotations + from datetime import timedelta import operator try: from zoneinfo import ZoneInfo except ImportError: - ZoneInfo = None + # Cannot assign to a type + ZoneInfo = None # type: ignore[misc, assignment] import numpy as np import pytest @@ 
-712,7 +715,9 @@ def test_tz_localize_t2d(self): # no tzdata pass else: - easts.append(tz) + # Argument 1 to "append" of "list" has incompatible type "ZoneInfo"; + # expected "str" + easts.append(tz) # type: ignore[arg-type] @pytest.mark.parametrize("tz", easts) def test_iter_zoneinfo_fold(self, tz): diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index f657cc71e6346..6d18a292061b9 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from datetime import ( datetime, timedelta, diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 05700841de7e1..6f3c83b999e94 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -22,7 +22,8 @@ try: from zoneinfo import ZoneInfo except ImportError: - ZoneInfo = None + # Cannot assign to a type [misc] + ZoneInfo = None # type: ignore[misc, assignment] from pandas._libs.tslibs import ( conversion, diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index b855232179b51..4851612392e68 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -848,7 +848,10 @@ def test_timestamp_constructor_retain_fold(tz, fold): _tzs = ["dateutil/Europe/London"] if PY39: try: - _tzs = ["dateutil/Europe/London", zoneinfo.ZoneInfo("Europe/London")] + _tzs = [ + "dateutil/Europe/London", + zoneinfo.ZoneInfo("Europe/London"), # type: ignore[list-item] + ] except zoneinfo.ZoneInfoNotFoundError: pass diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py index 820b2e17a9d3f..3a6953af4337e 100644 --- a/pandas/tests/scalar/timestamp/test_timezones.py +++ 
b/pandas/tests/scalar/timestamp/test_timezones.py @@ -34,7 +34,8 @@ try: from zoneinfo import ZoneInfo except ImportError: - ZoneInfo = None + # Cannot assign to a type + ZoneInfo = None # type: ignore[misc, assignment] class TestTimestampTZOperations: diff --git a/pyproject.toml b/pyproject.toml index 8b5603ecb7c7b..aacf8649add35 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ authors = [ { name = 'The Pandas Development Team', email='pandas-dev@python.org' }, ] license = {file = 'LICENSE'} -requires-python = '>=3.8' +requires-python = '>=3.9' dependencies = [ "numpy>=1.21.6; python_version<'3.11'", "numpy>=1.23.2; python_version>='3.11'", @@ -39,7 +39,6 @@ classifiers = [ 'Programming Language :: Python', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3 :: Only', - 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', @@ -171,7 +170,7 @@ select = "*-win32" environment = { IS_32_BIT="true" } [tool.black] -target-version = ['py38', 'py39'] +target-version = ['py39', 'py310'] required-version = '23.1.0' exclude = ''' ( @@ -194,7 +193,7 @@ exclude = ''' [tool.ruff] line-length = 88 update-check = false -target-version = "py38" +target-version = "py310" fix = true unfixable = ["E711"] @@ -257,6 +256,8 @@ ignore = [ "B023", # Functions defined inside a loop must not use variables redefined in the loop # "B301", # not yet implemented + # Only works with python >=3.10 + "B905", # Too many arguments to function call "PLR0913", # Too many returns @@ -471,7 +472,7 @@ follow_imports_for_stubs = false no_site_packages = false no_silence_site_packages = false # Platform configuration -python_version = "3.8" +python_version = "3.10" platform = "linux-64" # Disallow dynamic typing disallow_any_unimported = false # TODO @@ -549,7 +550,7 @@ skip_glob = "env" skip = "pandas/__init__.py" [tool.pyright] -pythonVersion = "3.8" 
+pythonVersion = "3.10" typeCheckingMode = "basic" include = ["pandas", "typings"] exclude = ["pandas/tests", "pandas/io/clipboard", "pandas/util/version"] diff --git a/requirements-dev.txt b/requirements-dev.txt index 30189c35fcbb5..d2f024886a129 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -27,7 +27,7 @@ lxml>=4.8.0 matplotlib>=3.6.1 numba>=0.55.2 numexpr>=2.8.0 -openpyxl<3.1.1, >=3.0.7 +openpyxl>=3.0.10 odfpy>=1.4.1 py psycopg2-binary>=2.9.3 diff --git a/scripts/run_stubtest.py b/scripts/run_stubtest.py index db7a327f231b5..dedcdb5532593 100644 --- a/scripts/run_stubtest.py +++ b/scripts/run_stubtest.py @@ -49,8 +49,8 @@ "pandas._libs.lib._NoDefault.no_default", # internal type alias (should probably be private) "pandas._libs.lib.ndarray_obj_2d", - # workaround for mypy (cache_readonly = property) - "pandas._libs.properties.cache_readonly.__get__", + # runtime argument "owner" has a default value but stub argument does not + "pandas._libs.properties.AxisProperty.__get__", "pandas._libs.properties.cache_readonly.deleter", "pandas._libs.properties.cache_readonly.getter", "pandas._libs.properties.cache_readonly.setter", diff --git a/scripts/validate_min_versions_in_sync.py b/scripts/validate_min_versions_in_sync.py index e0182ebaaee60..9a6d97a222000 100755 --- a/scripts/validate_min_versions_in_sync.py +++ b/scripts/validate_min_versions_in_sync.py @@ -124,6 +124,11 @@ def get_yaml_map_from( yaml_package, yaml_version2 = yaml_dependency.split(operator) yaml_version2 = operator + yaml_version2 yaml_map[yaml_package] = [yaml_version1, yaml_version2] + elif "[build=*_pypy]" in dependency: + search_text = search_text.replace("[build=*_pypy]", "") + yaml_package, yaml_version = search_text.split(operator) + yaml_version = operator + yaml_version + yaml_map[yaml_package] = [yaml_version] elif operator is not None: yaml_package, yaml_version = search_text.split(operator) yaml_version = operator + yaml_version From 
b0530e9561a45ec7e433d4d687b1806a5fe0f415 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 26 Apr 2023 08:43:24 -0700 Subject: [PATCH 059/577] DEPS: Address numpy deprecation of len 1 arrays assignment (#52906) * DEPS: Address numpy deprecation of len 1 arrays assignment * Address other failures, and investigate csv failure * Address csv error, undo one fix * Undo whitespace * Turn into 0D array --- pandas/core/indexing.py | 7 +++++++ pandas/core/internals/base.py | 4 ++++ pandas/core/internals/blocks.py | 4 +++- pandas/io/parsers/base_parser.py | 10 +++++++++- pandas/tests/groupby/test_groupby_dropna.py | 2 +- 5 files changed, 24 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 6aecfe5267e0c..0f1cd397f2dd1 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1753,6 +1753,13 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc"): if not isinstance(value, ABCSeries): # if not Series (in which case we need to align), # we can short-circuit + if ( + isinstance(arr, np.ndarray) + and arr.ndim == 1 + and len(arr) == 1 + ): + # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 + arr = arr[0, ...] empty_value[indexer[0]] = arr self.obj[key] = empty_value return diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index 523dee97a3c5c..8f771221c8890 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -187,6 +187,10 @@ def setitem_inplace(self, indexer, value) -> None: # dt64/td64, which do their own validation. value = np_can_hold_element(arr.dtype, value) + if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1: + # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 + value = value[0, ...] 
+ arr[indexer] = value def grouped_reduce(self, func): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index ff022cb047f3d..94670fe47036b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1063,7 +1063,9 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block: self = self.make_block_same_class( values.T if values.ndim == 2 else values ) - + if isinstance(casted, np.ndarray) and casted.ndim == 1 and len(casted) == 1: + # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 + casted = casted[0, ...] values[indexer] = casted return self diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 99fbfe46d22fc..0bac882756a91 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -1114,6 +1114,12 @@ def _make_date_converter( if date_parser is not lib.no_default and date_format is not None: raise TypeError("Cannot use both 'date_parser' and 'date_format'") + def unpack_if_single_element(arg): + # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 + if isinstance(arg, np.ndarray) and arg.ndim == 1 and len(arg) == 1: + return arg[0] + return arg + def converter(*date_cols, col: Hashable): if date_parser is lib.no_default: strs = parsing.concat_date_cols(date_cols) @@ -1137,7 +1143,9 @@ def converter(*date_cols, col: Hashable): else: try: result = tools.to_datetime( - date_parser(*date_cols), errors="ignore", cache=cache_dates + date_parser(*(unpack_if_single_element(arg) for arg in date_cols)), + errors="ignore", + cache=cache_dates, ) if isinstance(result, datetime.datetime): raise Exception("scalar parser") diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 48cc1518937b3..551642ff64fd5 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -627,7 +627,7 @@ def test_categorical_transformers( 
result = getattr(gb_keepna, transformation_func)(*args) expected = getattr(gb_dropna, transformation_func)(*args) for iloc, value in zip( - df[df["x"].isnull()].index.tolist(), null_group_result.values + df[df["x"].isnull()].index.tolist(), null_group_result.values.ravel() ): if expected.ndim == 1: expected.iloc[iloc] = value From 079acdbd0fd593b8dbb547646a6df9776616a55e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 26 Apr 2023 17:46:36 +0200 Subject: [PATCH 060/577] DOC: Remove versionadded/changed for 1.1.0 (#52905) * DOC: Remove versionadded/changed for 1.1.0 * Update contributing_codebase.rst * Remove double line break * Fix --- .../development/contributing_codebase.rst | 4 +- doc/source/user_guide/basics.rst | 6 --- doc/source/user_guide/boolean.rst | 2 - doc/source/user_guide/dsintro.rst | 2 - doc/source/user_guide/groupby.rst | 2 - doc/source/user_guide/indexing.rst | 2 - doc/source/user_guide/io.rst | 3 -- doc/source/user_guide/reshaping.rst | 2 - doc/source/user_guide/text.rst | 4 -- doc/source/user_guide/timeseries.rst | 8 ---- doc/source/user_guide/visualization.rst | 2 - doc/source/user_guide/window.rst | 6 --- pandas/_libs/testing.pyx | 6 --- pandas/_libs/tslibs/timestamps.pyx | 2 - pandas/_testing/asserters.py | 26 ----------- pandas/core/arrays/datetimes.py | 2 - pandas/core/common.py | 5 -- pandas/core/frame.py | 26 ----------- pandas/core/generic.py | 30 ++---------- pandas/core/groupby/groupby.py | 46 +++++++------------ pandas/core/groupby/grouper.py | 4 -- pandas/core/indexes/accessors.py | 2 - pandas/core/indexes/base.py | 2 - pandas/core/resample.py | 1 - pandas/core/reshape/tile.py | 2 - pandas/core/series.py | 9 ---- pandas/core/shared_docs.py | 6 --- pandas/core/strings/accessor.py | 2 - pandas/core/tools/timedeltas.py | 5 +- pandas/core/window/ewm.py | 4 -- pandas/errors/__init__.py | 2 - pandas/io/common.py | 6 +-- pandas/io/feather_format.py | 1 - pandas/io/formats/style.py | 2 - 
pandas/io/gbq.py | 1 - pandas/io/parsers/readers.py | 6 +-- pandas/io/pytables.py | 4 -- pandas/io/stata.py | 6 --- pandas/plotting/_core.py | 12 ----- 39 files changed, 29 insertions(+), 234 deletions(-) diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index e99dbbde3db85..184060d3cf697 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -959,9 +959,9 @@ directive is used. The sphinx syntax for that is: .. code-block:: rst - .. versionadded:: 1.1.0 + .. versionadded:: 2.1.0 -This will put the text *New in version 1.1.0* wherever you put the sphinx +This will put the text *New in version 2.1.0* wherever you put the sphinx directive. This should also be put in the docstring when adding a new function or method (`example `__) or a new keyword argument (`example `__). diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 48fcaf85f0f59..989d5128d7d08 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -686,8 +686,6 @@ of a 1D array of values. It can also be used as a function on regular arrays: s.value_counts() pd.value_counts(data) -.. versionadded:: 1.1.0 - The :meth:`~DataFrame.value_counts` method can be used to count combinations across multiple columns. By default all columns are used but a subset can be selected using the ``subset`` argument. @@ -1812,8 +1810,6 @@ used to sort a pandas object by its index levels. .. _basics.sort_index_key: -.. versionadded:: 1.1.0 - Sorting by index also supports a ``key`` parameter that takes a callable function to apply to the index being sorted. For ``MultiIndex`` objects, the key is applied per-level to the levels specified by ``level``. @@ -1867,8 +1863,6 @@ argument: .. _basics.sort_value_key: -.. 
versionadded:: 1.1.0 - Sorting also supports a ``key`` parameter that takes a callable function to apply to the values being sorted. diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst index a08a49bc2359c..3c361d4de17e5 100644 --- a/doc/source/user_guide/boolean.rst +++ b/doc/source/user_guide/boolean.rst @@ -24,8 +24,6 @@ Indexing with NA values pandas allows indexing with ``NA`` values in a boolean array, which are treated as ``False``. -.. versionchanged:: 1.0.2 - .. ipython:: python :okexcept: diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst index 9aa6423908cfd..01ec336f7aae5 100644 --- a/doc/source/user_guide/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -413,8 +413,6 @@ first ``namedtuple``, a ``ValueError`` is raised. From a list of dataclasses ~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 1.1.0 - Data Classes as introduced in `PEP557 `__, can be passed into the DataFrame constructor. Passing a list of dataclasses is equivalent to passing a list of dictionaries. diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 88727bc8f9025..85a6b04f834e1 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -218,8 +218,6 @@ For example, the groups created by ``groupby()`` below are in the order they app .. _groupby.dropna: -.. versionadded:: 1.1.0 - GroupBy dropna ^^^^^^^^^^^^^^ diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 4a8be77d23ea2..77eee8e58a5e8 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -371,8 +371,6 @@ For getting values with a boolean array: NA values in a boolean array propagate as ``False``: -.. versionchanged:: 1.0.2 - .. 
ipython:: python mask = pd.array([True, False, True, False, pd.NA, False], dtype="boolean") diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 60353dde5683f..91cd7315f7213 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -335,7 +335,6 @@ compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``'zstd' create a reproducible gzip archive: ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``. - .. versionchanged:: 1.1.0 dict option extended to support ``gzip`` and ``bz2``. .. versionchanged:: 1.2.0 Previous versions forwarded dict entries for 'gzip' to ``gzip.open``. thousands : str, default ``None`` Thousands separator. @@ -3890,8 +3889,6 @@ The :func:`~pandas.read_excel` method can read OpenDocument spreadsheets # Returns a DataFrame pd.read_excel("path_to_file.ods", engine="odf") -.. versionadded:: 1.1.0 - Similarly, the :func:`~pandas.to_excel` method can write OpenDocument spreadsheets .. code-block:: python diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 237ea1a4dd9c6..8d0f1048f6e77 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -298,8 +298,6 @@ For instance, When transforming a DataFrame using :func:`~pandas.melt`, the index will be ignored. The original index values can be kept around by setting the ``ignore_index`` parameter to ``False`` (default is ``True``). This will however duplicate them. -.. versionadded:: 1.1.0 - .. ipython:: python index = pd.MultiIndex.from_tuples([("person", "A"), ("person", "B")]) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 4e0b18c73ee29..c193df5118926 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -62,8 +62,6 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created s.astype("string") -.. 
versionchanged:: 1.1.0 - You can also use :class:`StringDtype`/``"string"`` as the dtype on non-string data and it will be converted to ``string`` dtype: @@ -666,8 +664,6 @@ Or whether elements match a pattern: dtype="string", ).str.match(pattern) -.. versionadded:: 1.1.0 - .. ipython:: python pd.Series( diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 0b73a7aea8b10..3519ac2d64f71 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -822,8 +822,6 @@ Furthermore, if you have a ``Series`` with datetimelike values, then you can access these properties via the ``.dt`` accessor, as detailed in the section on :ref:`.dt accessors`. -.. versionadded:: 1.1.0 - You may obtain the year, week and day components of the ISO year from the ISO 8601 standard: .. ipython:: python @@ -1870,8 +1868,6 @@ See :ref:`groupby.iterating-label` or :class:`Resampler.__iter__` for more. Use ``origin`` or ``offset`` to adjust the start of the bins ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 1.1.0 - The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. This works well with frequencies that are multiples of a day (like ``30D``) or that divide a day evenly (like ``90s`` or ``1min``). This can create inconsistencies with some frequencies that do not meet this criteria. To change this behavior you can specify a fixed Timestamp with the argument ``origin``. For example: @@ -2117,8 +2113,6 @@ PeriodIndex partial string indexing PeriodIndex now supports partial string slicing with non-monotonic indexes. -.. versionadded:: 1.1.0 - You can pass in dates and strings to ``Series`` and ``DataFrame`` with ``PeriodIndex``, in the same manner as ``DatetimeIndex``. For details, refer to :ref:`DatetimeIndex Partial String Indexing `. .. 
ipython:: python @@ -2491,8 +2485,6 @@ To remove time zone information, use ``tz_localize(None)`` or ``tz_convert(None) Fold ~~~~ -.. versionadded:: 1.1.0 - For ambiguous times, pandas supports explicitly specifying the keyword-only fold argument. Due to daylight saving time, one wall clock time can occur twice when shifting from summer to winter time; fold describes whether the datetime-like corresponds diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index a7d7a09a6bcc5..1f19894479348 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -1209,8 +1209,6 @@ shown by default. Controlling the labels ~~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 1.1.0 - You may set the ``xlabel`` and ``ylabel`` arguments to give the plot custom labels for x and y axis. By default, pandas will pick up index name as xlabel, while leaving it empty for ylabel. diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst index 99e57cacca05a..72c631631ab62 100644 --- a/doc/source/user_guide/window.rst +++ b/doc/source/user_guide/window.rst @@ -237,8 +237,6 @@ from present information back to past information. This allows the rolling windo Custom window rolling ~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 1.0 - In addition to accepting an integer or offset as a ``window`` argument, ``rolling`` also accepts a ``BaseIndexer`` subclass that allows a user to define a custom method for calculating window bounds. The ``BaseIndexer`` subclass will need to define a ``get_window_bounds`` method that returns @@ -358,8 +356,6 @@ the windows are cast as :class:`Series` objects (``raw=False``) or ndarray objec Numba engine ~~~~~~~~~~~~ -.. versionadded:: 1.0 - Additionally, :meth:`~Rolling.apply` can leverage `Numba `__ if installed as an optional dependency. 
The apply aggregation can be executed using Numba by specifying ``engine='numba'`` and ``engine_kwargs`` arguments (``raw`` must also be set to ``True``). @@ -593,8 +589,6 @@ and **alpha** to the EW functions: one half. * **Alpha** specifies the smoothing factor directly. -.. versionadded:: 1.1.0 - You can also specify ``halflife`` in terms of a timedelta convertible unit to specify the amount of time it takes for an observation to decay to half its value when also specifying a sequence of ``times``. diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index 19aa4e173215e..2f3bb566cbcb0 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -59,12 +59,8 @@ cpdef assert_almost_equal(a, b, b : object rtol : float, default 1e-5 Relative tolerance. - - .. versionadded:: 1.1.0 atol : float, default 1e-8 Absolute tolerance. - - .. versionadded:: 1.1.0 check_dtype: bool, default True check dtype if both a and b are np.ndarray. obj : str, default None @@ -80,8 +76,6 @@ cpdef assert_almost_equal(a, b, Specify shared index values of objects being compared, internally used to show appropriate assertion message. - .. versionadded:: 1.1.0 - """ cdef: double diff = 0.0 diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 3e1554e8b79b3..27ff719b1a143 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1302,8 +1302,6 @@ class Timestamp(_Timestamp): datetime-like corresponds to the first (0) or the second time (1) the wall clock hits the ambiguous time. - .. versionadded:: 1.1.0 - Notes ----- There are essentially three calling conventions for the constructor. The diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index c0d1f1eba9e09..02bd91d90362c 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -85,12 +85,8 @@ def assert_almost_equal( equivalent when doing type checking. rtol : float, default 1e-5 Relative tolerance. 
- - .. versionadded:: 1.1.0 atol : float, default 1e-8 Absolute tolerance. - - .. versionadded:: 1.1.0 """ if isinstance(left, Index): assert_index_equal( @@ -217,12 +213,8 @@ def assert_index_equal( .. versionadded:: 1.2.0 rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. - - .. versionadded:: 1.1.0 atol : float, default 1e-8 Absolute tolerance. Only used when check_exact is False. - - .. versionadded:: 1.1.0 obj : str, default 'Index' Specify object name being compared, internally used to show appropriate assertion message. @@ -711,12 +703,8 @@ def assert_extension_array_equal( Whether to compare number exactly. rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. - - .. versionadded:: 1.1.0 atol : float, default 1e-8 Absolute tolerance. Only used when check_exact is False. - - .. versionadded:: 1.1.0 obj : str, default 'ExtensionArray' Specify object name being compared, internally used to show appropriate assertion message. @@ -842,12 +830,8 @@ def assert_series_equal( Whether to compare internal Categorical exactly. check_category_order : bool, default True Whether to compare category order of internal Categoricals. - - .. versionadded:: 1.0.2 check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. - - .. versionadded:: 1.1.0 check_flags : bool, default True Whether to check the `flags` attribute. @@ -855,12 +839,8 @@ def assert_series_equal( rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. - - .. versionadded:: 1.1.0 atol : float, default 1e-8 Absolute tolerance. Only used when check_exact is False. - - .. versionadded:: 1.1.0 obj : str, default 'Series' Specify object name being compared, internally used to show appropriate assertion message. @@ -1111,18 +1091,12 @@ def assert_frame_equal( (same as in columns) - same labels must be with the same data. 
check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. - - .. versionadded:: 1.1.0 check_flags : bool, default True Whether to check the `flags` attribute. rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. - - .. versionadded:: 1.1.0 atol : float, default 1e-8 Absolute tolerance. Only used when check_exact is False. - - .. versionadded:: 1.1.0 obj : str, default 'DataFrame' Specify object name being compared, internally used to show appropriate assertion message. diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index b14a54a872a69..55465148852cc 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1350,8 +1350,6 @@ def isocalendar(self) -> DataFrame: """ Calculate year, week, and day according to the ISO 8601 standard. - .. versionadded:: 1.1.0 - Returns ------- DataFrame diff --git a/pandas/core/common.py b/pandas/core/common.py index 6b7a0214925df..da99b72d60302 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -439,11 +439,6 @@ def random_state(state: RandomState | None = None): If receives `None`, returns np.random. If receives anything else, raises an informative ValueError. - .. versionchanged:: 1.1.0 - - array-like and BitGenerator object now passed to np.random.RandomState() - as seed - Default None. Returns diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 838a34adeaf82..9bfc2e868e03b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1801,8 +1801,6 @@ def to_numpy( The value to use for missing values. The default value depends on `dtype` and the dtypes of the DataFrame columns. - .. versionadded:: 1.1.0 - Returns ------- numpy.ndarray @@ -2612,8 +2610,6 @@ def to_stata( 8 characters and values are repeated. {compression_options} - .. versionadded:: 1.1.0 - .. versionchanged:: 1.4.0 Zstandard support. 
{storage_options} @@ -2709,8 +2705,6 @@ def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None: This includes the `compression`, `compression_level`, `chunksize` and `version` keywords. - .. versionadded:: 1.1.0 - Notes ----- This function writes the dataframe as a `feather file @@ -3109,8 +3103,6 @@ def to_html( Convert URLs to HTML links. encoding : str, default "utf-8" Set character encoding. - - .. versionadded:: 1.0 %(returns)s See Also -------- @@ -6596,8 +6588,6 @@ def sort_values( ``Series`` and return a Series with the same shape as the input. It will be applied to each column in `by` independently. - .. versionadded:: 1.1.0 - Returns ------- DataFrame or None @@ -6885,8 +6875,6 @@ def sort_index( ``Index`` and return an ``Index`` of the same shape. For MultiIndex inputs, the key is applied *per level*. - .. versionadded:: 1.1.0 - Returns ------- DataFrame or None @@ -6955,8 +6943,6 @@ def value_counts( """ Return a Series containing counts of unique rows in the DataFrame. - .. versionadded:: 1.1.0 - Parameters ---------- subset : label or list of labels, optional @@ -8677,16 +8663,8 @@ def groupby( ----------%s columns : str or object or a list of str Column to use to make new frame's columns. - - .. versionchanged:: 1.1.0 - Also accept list of columns names. - index : str or object or a list of str, optional Column to use to make new frame's index. If not given, uses existing index. - - .. versionchanged:: 1.1.0 - Also accept list of index names. - values : str, object or a list of the previous, optional Column(s) to use for populating new frame's values. If not specified, all remaining columns will be used and the result will @@ -9188,8 +9166,6 @@ def explode( ignore_index : bool, default False If True, the resulting index will be labeled 0, 1, …, n - 1. - .. versionadded:: 1.1.0 - Returns ------- DataFrame @@ -10514,8 +10490,6 @@ def cov( is ``N - ddof``, where ``N`` represents the number of elements. 
This argument is applicable only when no ``nan`` is in the dataframe. - .. versionadded:: 1.1.0 - numeric_only : bool, default False Include only `float`, `int` or `boolean` data. diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b6ade0728e075..2ee1e0512de74 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3680,14 +3680,10 @@ def to_csv( is a non-binary file object. {compression_options} - .. versionchanged:: 1.0.0 - - May now be a dict with key 'method' as compression mode + May be a dict with key 'method' as compression mode and other entries as additional compression options if compression mode is 'zip'. - .. versionchanged:: 1.1.0 - Passing compression options as keys in dict is supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'. @@ -3734,8 +3730,6 @@ def to_csv( See the errors argument for :func:`open` for a full list of options. - .. versionadded:: 1.1.0 - {storage_options} .. versionadded:: 1.2.0 @@ -4940,8 +4934,6 @@ def sort_values( ``Series`` and return a Series with the same shape as the input. It will be applied to each column in `by` independently. - .. versionadded:: 1.1.0 - Returns ------- DataFrame or None @@ -5809,11 +5801,6 @@ def sample( If int, array-like, or BitGenerator, seed for random number generator. If np.random.RandomState or np.random.Generator, use as given. - .. versionchanged:: 1.1.0 - - array-like and BitGenerator object now passed to np.random.RandomState() - as seed - .. versionchanged:: 1.4.0 np.random.Generator objects now accepted @@ -7598,11 +7585,10 @@ def interpolate( * If 'method' is 'backfill' or 'bfill', the default is 'backward' * else the default is 'forward' - .. versionchanged:: 1.1.0 - raises ValueError if `limit_direction` is 'forward' or 'both' and - method is 'backfill' or 'bfill'. - raises ValueError if `limit_direction` is 'backward' or 'both' and - method is 'pad' or 'ffill'. 
+ raises ValueError if `limit_direction` is 'forward' or 'both' and + method is 'backfill' or 'bfill'. + raises ValueError if `limit_direction` is 'backward' or 'both' and + method is 'pad' or 'ffill'. limit_area : {{`None`, 'inside', 'outside'}}, default None If limit is specified, consecutive NaNs will be filled with this @@ -8693,8 +8679,6 @@ def resample( - 'start': `origin` is the first value of the timeseries - 'start_day': `origin` is the first day at midnight of the timeseries - .. versionadded:: 1.1.0 - - 'end': `origin` is the last value of the timeseries - 'end_day': `origin` is the ceiling midnight of the last day @@ -8703,8 +8687,6 @@ def resample( offset : Timedelta or str, default is None An offset timedelta added to the origin. - .. versionadded:: 1.1.0 - group_keys : bool, default False Whether to include the group keys in the result index when using ``.apply()`` on the resampled object. @@ -10275,8 +10257,6 @@ def shift( For datetime, timedelta, or period data, etc. :attr:`NaT` is used. For extension dtypes, ``self.dtype.na_value`` is used. - .. versionchanged:: 1.1.0 - Returns ------- {klass} diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 50b39ee977ed4..3ee05b8c421dc 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -417,8 +417,6 @@ class providing the base-class of operations. If a string is chosen, then it needs to be the name of the groupby method you want to use. - - .. versionchanged:: 1.1.0 *args Positional arguments to pass to func. engine : str, default None @@ -426,7 +424,6 @@ class providing the base-class of operations. * ``'numba'`` : Runs the function through JIT compiled code from numba. * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba`` - .. 
versionadded:: 1.1.0 engine_kwargs : dict, default None * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` @@ -435,7 +432,6 @@ class providing the base-class of operations. ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be applied to the function - .. versionadded:: 1.1.0 **kwargs Keyword arguments to be passed into func. @@ -508,17 +504,15 @@ class providing the base-class of operations. column is keyword, whereas the value determines the aggregation used to compute the values in the column. - .. versionchanged:: 1.1.0 - - Can also accept a Numba JIT function with - ``engine='numba'`` specified. Only passing a single function is supported - with this engine. + Can also accept a Numba JIT function with + ``engine='numba'`` specified. Only passing a single function is supported + with this engine. - If the ``'numba'`` engine is chosen, the function must be - a user defined function with ``values`` and ``index`` as the - first and second arguments respectively in the function signature. - Each group's index will be passed to the user defined function - and optionally available for use. + If the ``'numba'`` engine is chosen, the function must be + a user defined function with ``values`` and ``index`` as the + first and second arguments respectively in the function signature. + Each group's index will be passed to the user defined function + and optionally available for use. .. deprecated:: 2.1.0 @@ -531,7 +525,6 @@ class providing the base-class of operations. * ``'numba'`` : Runs the function through JIT compiled code from numba. * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` - .. 
versionadded:: 1.1.0 engine_kwargs : dict, default None * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` @@ -540,7 +533,6 @@ class providing the base-class of operations. ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be applied to the function - .. versionadded:: 1.1.0 **kwargs * If ``func`` is None, ``**kwargs`` are used to define the output names and aggregations via Named Aggregation. See ``func`` entry. @@ -595,17 +587,15 @@ class providing the base-class of operations. column is keyword, whereas the value determines the aggregation used to compute the values in the column. - .. versionchanged:: 1.1.0 + Can also accept a Numba JIT function with + ``engine='numba'`` specified. Only passing a single function is supported + with this engine. - Can also accept a Numba JIT function with - ``engine='numba'`` specified. Only passing a single function is supported - with this engine. - - If the ``'numba'`` engine is chosen, the function must be - a user defined function with ``values`` and ``index`` as the - first and second arguments respectively in the function signature. - Each group's index will be passed to the user defined function - and optionally available for use. + If the ``'numba'`` engine is chosen, the function must be + a user defined function with ``values`` and ``index`` as the + first and second arguments respectively in the function signature. + Each group's index will be passed to the user defined function + and optionally available for use. *args Positional arguments to pass to func. @@ -614,7 +604,6 @@ class providing the base-class of operations. * ``'numba'`` : Runs the function through JIT compiled code from numba. * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` - .. 
versionadded:: 1.1.0 engine_kwargs : dict, default None * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` @@ -623,7 +612,6 @@ class providing the base-class of operations. ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be applied to the function - .. versionadded:: 1.1.0 **kwargs * If ``func`` is None, ``**kwargs`` are used to define the output names and aggregations via Named Aggregation. See ``func`` entry. @@ -4170,8 +4158,6 @@ def sample( You can use `random_state` for reproducibility. - .. versionadded:: 1.1.0 - Parameters ---------- n : int, optional diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 13f43c1bf64a3..e9833a41e2795 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -99,8 +99,6 @@ class Grouper: - 'start': `origin` is the first value of the timeseries - 'start_day': `origin` is the first day at midnight of the timeseries - .. versionadded:: 1.1.0 - - 'end': `origin` is the last value of the timeseries - 'end_day': `origin` is the ceiling midnight of the last day @@ -109,8 +107,6 @@ class Grouper: offset : Timedelta or str, default is None An offset timedelta added to the origin. - .. versionadded:: 1.1.0 - dropna : bool, default True If True, and if group keys contain NA values, NA values together with row/column will be dropped. If False, NA values will also be treated as diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index f86728ad8b686..3ddc8aaf02d97 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -369,8 +369,6 @@ def isocalendar(self) -> DataFrame: """ Calculate year, week, and day according to the ISO 8601 standard. - .. 
versionadded:: 1.1.0 - Returns ------- DataFrame diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8c557c2824ed4..0f113fae81406 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5613,8 +5613,6 @@ def sort_values( this `key` function should be *vectorized*. It should expect an ``Index`` and return an ``Index`` of the same shape. - .. versionadded:: 1.1.0 - Returns ------- sorted_index : pandas.Index diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 7a39891f74523..bde8a96e0daf3 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -908,7 +908,6 @@ def interpolate( * If 'method' is 'backfill' or 'bfill', the default is 'backward' * else the default is 'forward' - .. versionchanged:: 1.1.0 raises ValueError if `limit_direction` is 'forward' or 'both' and method is 'backfill' or 'bfill'. raises ValueError if `limit_direction` is 'backward' or 'both' and diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 00d5059440536..357353ed38d46 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -114,8 +114,6 @@ def cut( the resulting categorical will be ordered. If False, the resulting categorical will be unordered (labels must be provided). - .. versionadded:: 1.1.0 - Returns ------- out : Categorical, Series, or ndarray diff --git a/pandas/core/series.py b/pandas/core/series.py index 96971736f113b..9693981cc5422 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1757,7 +1757,6 @@ def to_markdown( index : bool, optional, default True Add index (row) labels. - .. versionadded:: 1.1.0 {storage_options} .. versionadded:: 1.2.0 @@ -2721,8 +2720,6 @@ def cov( Delta degrees of freedom. The divisor used in calculations is ``N - ddof``, where ``N`` represents the number of elements. - .. versionadded:: 1.1.0 - Returns ------- float @@ -3396,8 +3393,6 @@ def sort_values( this `key` function should be *vectorized*. 
It should expect a ``Series`` and return an array-like. - .. versionadded:: 1.1.0 - Returns ------- Series or None @@ -3657,8 +3652,6 @@ def sort_index( this `key` function should be *vectorized*. It should expect an ``Index`` and return an ``Index`` of the same shape. - .. versionadded:: 1.1.0 - Returns ------- Series or None @@ -4118,8 +4111,6 @@ def explode(self, ignore_index: bool = False) -> Series: ignore_index : bool, default False If True, the resulting index will be labeled 0, 1, …, n - 1. - .. versionadded:: 1.1.0 - Returns ------- Series diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 0e37ec5dc45f0..6281475b6926f 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -60,8 +60,6 @@ ] = """ Compare to another {klass} and show the differences. -.. versionadded:: 1.1.0 - Parameters ---------- other : {klass} @@ -175,8 +173,6 @@ with row/column will be dropped. If False, NA values will also be treated as the key in groups. - .. versionadded:: 1.1.0 - Returns ------- pandas.api.typing.%(klass)sGroupBy @@ -224,8 +220,6 @@ If True, original index is ignored. If False, the original index is retained. Index labels will be repeated as necessary. - .. versionadded:: 1.1.0 - Returns ------- DataFrame diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index ef9c685e618fb..6049002e3673d 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1300,8 +1300,6 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None): """ Determine if each string entirely matches a regular expression. - .. 
versionadded:: 1.1.0 - Parameters ---------- pat : str diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index 2e91927a002bb..ad366e58c2f06 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -116,10 +116,7 @@ def to_timedelta( * 'us' / 'microseconds' / 'microsecond' / 'micro' / 'micros' / 'U' * 'ns' / 'nanoseconds' / 'nano' / 'nanos' / 'nanosecond' / 'N' - .. versionchanged:: 1.1.0 - - Must not be specified when `arg` context strings and - ``errors="raise"``. + Must not be specified when `arg` context strings and ``errors="raise"``. errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception. diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 295a9ecac7fb8..42123fafd62aa 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -154,8 +154,6 @@ class ExponentialMovingWindow(BaseWindow): observation decays to half its value. Only applicable to ``mean()``, and halflife value will not apply to the other functions. - .. versionadded:: 1.1.0 - alpha : float, optional Specify smoothing factor :math:`\alpha` directly @@ -209,8 +207,6 @@ class ExponentialMovingWindow(BaseWindow): times : np.ndarray, Series, default None - .. versionadded:: 1.1.0 - Only applicable to ``mean()``. Times corresponding to the observations. Must be monotonically increasing and diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index c26b478338a55..d4bcc9cee7155 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -234,8 +234,6 @@ class DuplicateLabelError(ValueError): class InvalidIndexError(Exception): """ Exception raised when attempting to use an invalid index key. - - .. versionadded:: 1.1.0 """ diff --git a/pandas/io/common.py b/pandas/io/common.py index 02de416e5ce37..7cdb50c629d21 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -670,13 +670,11 @@ def get_handle( Encoding to use. 
{compression_options} - .. versionchanged:: 1.0.0 - May now be a dict with key 'method' as compression mode + May be a dict with key 'method' as compression mode and other keys as compression options if compression mode is 'zip'. - .. versionchanged:: 1.1.0 - Passing compression options as keys in dict is now + Passing compression options as keys in dict is supported for compression modes 'gzip', 'bz2', 'zstd' and 'zip'. .. versionchanged:: 1.4.0 Zstandard support. diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 4d17173fa0ceb..28df235084cf5 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -49,7 +49,6 @@ def to_feather( **kwargs : Additional keywords passed to `pyarrow.feather.write_feather`. - .. versionadded:: 1.1.0 """ import_optional_dependency("pyarrow") from pyarrow import feather diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 8ddf14175a6da..181b623cd52a6 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -3053,8 +3053,6 @@ def highlight_null( %(subset)s - .. versionadded:: 1.1.0 - %(props)s .. versionadded:: 1.3.0 diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 286d2b187c700..597369bb12703 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -134,7 +134,6 @@ def read_gbq( If set, limit the maximum number of rows to fetch from the query results. - .. versionadded:: 1.1.0 progress_bar_type : Optional, str If set, use the `tqdm `__ library to display a progress bar while the data downloads. Install the diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index f1f44a71b9a3b..8b2a02f0ac63a 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -362,9 +362,9 @@ Specifies what to do upon encountering a bad line (a line with too many fields). Allowed values are : - - 'error', raise an Exception when a bad line is encountered. - - 'warn', raise a warning when a bad line is encountered and skip that line. 
- - 'skip', skip bad lines without raising or warning when they are encountered. + - 'error', raise an Exception when a bad line is encountered. + - 'warn', raise a warning when a bad line is encountered and skip that line. + - 'skip', skip bad lines without raising or warning when they are encountered. .. versionadded:: 1.3.0 diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 8de1aaacaf400..33ff24f5fc981 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -654,8 +654,6 @@ def keys(self, include: str = "pandas") -> list[str]: When kind equals 'pandas' return pandas objects. When kind equals 'native' return native HDF5 Table objects. - .. versionadded:: 1.1.0 - Returns ------- list @@ -1109,8 +1107,6 @@ def put( independent on creation time. dropna : bool, default False, optional Remove missing values. - - .. versionadded:: 1.1.0 """ if format is None: format = get_option("io.hdf.default_format") or "fixed" diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 3dddce98b35be..0492ba22dcf8a 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2266,8 +2266,6 @@ class StataWriter(StataParser): Each label must be 80 characters or smaller. {compression_options} - .. versionadded:: 1.1.0 - .. versionchanged:: 1.4.0 Zstandard support. {storage_options} @@ -3195,8 +3193,6 @@ class StataWriter117(StataWriter): characters, and either frequently repeated or sparse. {compression_options} - .. versionadded:: 1.1.0 - .. versionchanged:: 1.4.0 Zstandard support. value_labels : dict of dicts @@ -3587,8 +3583,6 @@ class StataWriterUTF8(StataWriter117): for storing larger DataFrames. {compression_options} - .. versionadded:: 1.1.0 - .. versionchanged:: 1.4.0 Zstandard support. 
value_labels : dict of dicts diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 8dd3e4385a383..38e1be302b054 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -87,8 +87,6 @@ def hist_series( legend : bool, default False Whether to show the legend. - .. versionadded:: 1.1.0 - **kwargs To be passed to the actual plotting function. @@ -197,8 +195,6 @@ def hist_frame( legend : bool, default False Whether to show the legend. - .. versionadded:: 1.1.0 - **kwargs All other plotting keyword arguments to be passed to :meth:`matplotlib.pyplot.hist`. @@ -448,8 +444,6 @@ def hist_frame( `b`, then passing {'a': 'green', 'b': 'red'} will color %(kind)ss for column `a` in green and %(kind)ss for column `b` in red. - .. versionadded:: 1.1.0 - **kwargs Additional keyword arguments are documented in :meth:`DataFrame.plot`. @@ -705,8 +699,6 @@ class PlotAccessor(PandasObject): Name to use for the xlabel on x-axis. Default uses index name as xlabel, or the x-column name for planar plots. - .. versionadded:: 1.1.0 - .. versionchanged:: 1.2.0 Now applicable to planar plots (`scatter`, `hexbin`). @@ -719,8 +711,6 @@ class PlotAccessor(PandasObject): Name to use for the ylabel on y-axis. Default will show no ylabel, or the y-column name for planar plots. - .. versionadded:: 1.1.0 - .. versionchanged:: 1.2.0 Now applicable to planar plots (`scatter`, `hexbin`). @@ -1637,8 +1627,6 @@ def scatter( recursively. For instance, when passing [2,14] all points size will be either 2 or 14, alternatively. - .. versionchanged:: 1.1.0 - c : str, int or array-like, optional The color of each point. 
Possible values are: From d51db43914579ed0ce0896883b2fa33951189c1b Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 26 Apr 2023 11:53:48 -0400 Subject: [PATCH 061/577] TST/BUG: pyarrow test fixtures upcasting dtypes (#52929) fix arrow test fixtures upcasting --- pandas/tests/extension/test_arrow.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 0300b271acc3f..e9a9dbbc8d965 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -130,7 +130,7 @@ def data(dtype): @pytest.fixture def data_missing(data): """Length-2 array with [NA, Valid]""" - return type(data)._from_sequence([None, data[0]]) + return type(data)._from_sequence([None, data[0]], dtype=data.dtype) @pytest.fixture(params=["data", "data_missing"]) @@ -213,7 +213,8 @@ def data_for_sorting(data_for_grouping): A < B < C """ return type(data_for_grouping)._from_sequence( - [data_for_grouping[0], data_for_grouping[7], data_for_grouping[4]] + [data_for_grouping[0], data_for_grouping[7], data_for_grouping[4]], + dtype=data_for_grouping.dtype, ) @@ -226,7 +227,8 @@ def data_missing_for_sorting(data_for_grouping): A < B and NA missing. 
""" return type(data_for_grouping)._from_sequence( - [data_for_grouping[0], data_for_grouping[2], data_for_grouping[4]] + [data_for_grouping[0], data_for_grouping[2], data_for_grouping[4]], + dtype=data_for_grouping.dtype, ) From 44b6cc7a7d3cea6609050194d6a86dabb9e2425e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 26 Apr 2023 17:55:50 +0200 Subject: [PATCH 062/577] PERF/CLN: let pyarrow concat chunks instead of doing it ourselves in __from_arrow__ (#52928) * PERF: let pyarrow concat chunks instead of doing it ourselves in __from_arrow__ * workaround for empty chunked arrays for older pyarrow --- pandas/core/arrays/numeric.py | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index df16419ea2bf1..da55c4193c162 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -88,27 +88,16 @@ def __from_arrow__( array = array.cast(pyarrow_type) - if isinstance(array, pyarrow.Array): - chunks = [array] - else: - # pyarrow.ChunkedArray - chunks = array.chunks - - results = [] - for arr in chunks: - data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.numpy_dtype) - num_arr = array_class(data.copy(), ~mask, copy=False) - results.append(num_arr) - - if not results: - return array_class( - np.array([], dtype=self.numpy_dtype), np.array([], dtype=np.bool_) - ) - elif len(results) == 1: - # avoid additional copy in _concat_same_type - return results[0] - else: - return array_class._concat_same_type(results) + if isinstance(array, pyarrow.ChunkedArray): + # TODO this "if" can be removed when requiring pyarrow >= 10.0, which fixed + # combine_chunks for empty arrays https://github.com/apache/arrow/pull/13757 + if array.num_chunks == 0: + array = pyarrow.array([], type=array.type) + else: + array = array.combine_chunks() + + data, mask = pyarrow_array_to_numpy_and_mask(array, dtype=self.numpy_dtype) + return 
array_class(data.copy(), ~mask, copy=False) @classmethod def _str_to_dtype_mapping(cls) -> Mapping[str, NumericDtype]: From 7a3a6c4a19c577e5dc07d0f04b5e1699099ce794 Mon Sep 17 00:00:00 2001 From: Parthi Date: Wed, 26 Apr 2023 21:29:43 +0530 Subject: [PATCH 063/577] TST: add test case to verify groupby-apply producing consistent output (#52926) --- pandas/tests/groupby/test_apply.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index e5599d60b4f0d..ac5bcb81e7307 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1485,3 +1485,23 @@ def test_empty_df(method, op): ) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "group_col", + [([0.0, np.nan, 0.0, 0.0]), ([np.nan, 0.0, 0.0, 0.0]), ([0, 0.0, 0.0, np.nan])], +) +def test_apply_inconsistent_output(group_col): + # GH 34478 + df = DataFrame({"group_col": group_col, "value_col": [2, 2, 2, 2]}) + + result = df.groupby("group_col").value_col.apply( + lambda x: x.value_counts().reindex(index=[1, 2, 3]) + ) + expected = Series( + [np.nan, 3.0, np.nan], + name="value_col", + index=MultiIndex.from_product([[0.0], [1, 2, 3]], names=["group_col", 0.0]), + ) + + tm.assert_series_equal(result, expected) From f8045b4a13879df50659c7c19e7a006efae1b11c Mon Sep 17 00:00:00 2001 From: Jiawei Zhang Date: Wed, 26 Apr 2023 12:58:53 -0400 Subject: [PATCH 064/577] Metadata propagation both in DataFrame.corrwith and Dataframe.sum (#52923) Fix the metadata propagation both in DataFrame.corrwith and Dataframe.sum --- pandas/core/frame.py | 3 ++- pandas/tests/generic/test_finalize.py | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9bfc2e868e03b..04c1b18cb1af1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10979,7 +10979,8 @@ def sum( min_count: int = 0, **kwargs, ): - return super().sum(axis, 
skipna, numeric_only, min_count, **kwargs) + result = super().sum(axis, skipna, numeric_only, min_count, **kwargs) + return result.__finalize__(self, method="sum") @doc(make_doc("prod", ndim=2)) def prod( diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index a76b6b94d719d..d6c4dff055748 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -177,7 +177,6 @@ frame_data, operator.methodcaller("corrwith", pd.DataFrame(*frame_data)), ), - marks=not_implemented_mark, ), pytest.param( (pd.DataFrame, frame_data, operator.methodcaller("count")), @@ -414,7 +413,6 @@ ), pytest.param( (pd.DataFrame, frame_data, operator.methodcaller("sum")), - marks=not_implemented_mark, ), pytest.param( (pd.DataFrame, frame_data, operator.methodcaller("std")), From d0e2524dbb82cd0976671d62f5f86ae376f29ecb Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 27 Apr 2023 00:36:11 +0200 Subject: [PATCH 065/577] Remove compat for Python lower than 3.9 (#52935) * Remove compat for Python lower than 3.9 * Remove wheel build * Fix mypy --- .circleci/config.yml | 2 +- pandas/compat/__init__.py | 2 -- pandas/compat/_constants.py | 2 -- pandas/conftest.py | 28 ++++++++----------- pandas/core/computation/expr.py | 4 --- .../scalar/timestamp/test_constructors.py | 25 ++++++----------- 6 files changed, 22 insertions(+), 41 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index e7322e748662f..549a6374246a0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -92,4 +92,4 @@ workflows: only: /^v.*/ matrix: parameters: - cibw-build: ["cp38-manylinux_aarch64", "cp39-manylinux_aarch64", "cp310-manylinux_aarch64", "cp311-manylinux_aarch64"] + cibw-build: ["cp39-manylinux_aarch64", "cp310-manylinux_aarch64", "cp311-manylinux_aarch64"] diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 745b20dc4e764..3d7589bf67ee2 100644 --- 
a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -16,7 +16,6 @@ from pandas.compat._constants import ( IS64, - PY39, PY310, PY311, PYPY, @@ -161,7 +160,6 @@ def get_lzma_file() -> type[pandas.compat.compressors.LZMAFile]: "pa_version_under9p0", "pa_version_under11p0", "IS64", - "PY39", "PY310", "PY311", "PYPY", diff --git a/pandas/compat/_constants.py b/pandas/compat/_constants.py index 75d99f5ae51fb..1d522a5b4cd09 100644 --- a/pandas/compat/_constants.py +++ b/pandas/compat/_constants.py @@ -12,7 +12,6 @@ IS64 = sys.maxsize > 2**32 -PY39 = sys.version_info >= (3, 9) PY310 = sys.version_info >= (3, 10) PY311 = sys.version_info >= (3, 11) PYPY = platform.python_implementation() == "PyPy" @@ -20,7 +19,6 @@ __all__ = [ "IS64", - "PY39", "PY310", "PY311", "PYPY", diff --git a/pandas/conftest.py b/pandas/conftest.py index 9ede9e65a6839..86f0121dd00a9 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -64,7 +64,6 @@ Series, Timedelta, Timestamp, - compat, ) import pandas._testing as tm from pandas.core import ops @@ -81,18 +80,12 @@ del pa has_pyarrow = True -zoneinfo = None -if compat.PY39: - # Import "zoneinfo" could not be resolved (reportMissingImports) - import zoneinfo # type: ignore[assignment] +import zoneinfo - # Although zoneinfo can be imported in Py39, it is effectively - # "not available" without tzdata/IANA tz data. 
- # We will set zoneinfo to not found in this case - try: - zoneinfo.ZoneInfo("UTC") # type: ignore[attr-defined] - except zoneinfo.ZoneInfoNotFoundError: # type: ignore[attr-defined] - zoneinfo = None +try: + zoneinfo.ZoneInfo("UTC") +except zoneinfo.ZoneInfoNotFoundError: + zoneinfo = None # type: ignore[assignment] # ---------------------------------------------------------------- @@ -1221,7 +1214,12 @@ def iris(datapath) -> DataFrame: timezone(timedelta(hours=-1), name="foo"), ] if zoneinfo is not None: - TIMEZONES.extend([zoneinfo.ZoneInfo("US/Pacific"), zoneinfo.ZoneInfo("UTC")]) + TIMEZONES.extend( + [ + zoneinfo.ZoneInfo("US/Pacific"), # type: ignore[list-item] + zoneinfo.ZoneInfo("UTC"), # type: ignore[list-item] + ] + ) TIMEZONE_IDS = [repr(i) for i in TIMEZONES] @@ -1964,9 +1962,7 @@ def using_copy_on_write() -> bool: warsaws = ["Europe/Warsaw", "dateutil/Europe/Warsaw"] if zoneinfo is not None: - warsaws.append( - zoneinfo.ZoneInfo("Europe/Warsaw") # pyright: ignore[reportGeneralTypeIssues] - ) + warsaws.append(zoneinfo.ZoneInfo("Europe/Warsaw")) # type: ignore[arg-type] @pytest.fixture(params=warsaws) diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 75e8b30d2e1f5..8436126232cf9 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -17,7 +17,6 @@ import numpy as np -from pandas.compat import PY39 from pandas.errors import UndefinedVariableError import pandas.core.common as com @@ -208,9 +207,6 @@ def _filter_nodes(superclass, all_nodes=_all_nodes): _keyword_nodes = _filter_nodes(ast.keyword) _alias_nodes = _filter_nodes(ast.alias) -if not PY39: - _slice_nodes = _filter_nodes(ast.slice) - # nodes that we don't support directly but are needed for parsing _hacked_nodes = frozenset(["Assign", "Module", "Expr"]) diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index 4851612392e68..ceadf7a280a1b 100644 --- 
a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -5,6 +5,7 @@ timedelta, timezone, ) +import zoneinfo import dateutil.tz from dateutil.tz import tzutc @@ -13,10 +14,7 @@ import pytz from pandas._libs.tslibs.dtypes import NpyDatetimeUnit -from pandas.compat import ( - PY39, - PY310, -) +from pandas.compat import PY310 from pandas.errors import OutOfBoundsDatetime from pandas import ( @@ -25,9 +23,6 @@ Timestamp, ) -if PY39: - import zoneinfo - class TestTimestampConstructors: def test_construct_from_string_invalid_raises(self): @@ -845,15 +840,13 @@ def test_timestamp_constructor_retain_fold(tz, fold): assert result == expected -_tzs = ["dateutil/Europe/London"] -if PY39: - try: - _tzs = [ - "dateutil/Europe/London", - zoneinfo.ZoneInfo("Europe/London"), # type: ignore[list-item] - ] - except zoneinfo.ZoneInfoNotFoundError: - pass +try: + _tzs = [ + "dateutil/Europe/London", + zoneinfo.ZoneInfo("Europe/London"), + ] +except zoneinfo.ZoneInfoNotFoundError: + _tzs = ["dateutil/Europe/London"] @pytest.mark.parametrize("tz", _tzs) From 471dcd1f57cae18bced4feaa9d2c0ecb49ff4d6c Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> Date: Thu, 27 Apr 2023 00:37:25 +0200 Subject: [PATCH 066/577] DOC: add examples to offsets.YearEnd (#52942) add examples to offsets.YearEnd --- pandas/_libs/tslibs/offsets.pyx | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 0614cb2a9d8c9..31acf0ef1bbe4 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -2291,13 +2291,29 @@ cdef class BYearBegin(YearOffset): cdef class YearEnd(YearOffset): """ - DateOffset increments between calendar year ends. + DateOffset increments between calendar year end dates. + + YearEnd goes to the next date which is the end of the year. 
+ + See Also + -------- + :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment. Examples -------- >>> ts = pd.Timestamp(2022, 1, 1) >>> ts + pd.offsets.YearEnd() Timestamp('2022-12-31 00:00:00') + + >>> ts = pd.Timestamp(2022, 12, 31) + >>> ts + pd.offsets.YearEnd() + Timestamp('2023-12-31 00:00:00') + + If you want to get the end of the current year: + + >>> ts = pd.Timestamp(2022, 12, 31) + >>> pd.offsets.YearEnd().rollback(ts) + Timestamp('2022-12-31 00:00:00') """ _default_month = 12 @@ -2316,9 +2332,9 @@ cdef class YearEnd(YearOffset): cdef class YearBegin(YearOffset): """ - DateOffset of one year at beginning. + DateOffset increments between calendar year begin dates. - YearBegin goes to the next date which is a start of the year. + YearBegin goes to the next date which is the start of the year. See Also -------- From 0cf70eb3d630aabeccc68d1f2e112db7c396b974 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 27 Apr 2023 02:15:00 +0200 Subject: [PATCH 067/577] BUG: pd.array raising with NumPy array and large dtype (#52591) * BUG: pd.array raising with NumPy array and large dtype * Fix * Add gh ref * Move --- doc/source/whatsnew/v2.0.2.rst | 1 + pandas/core/arrays/arrow/array.py | 10 ++++++++++ pandas/tests/extension/test_arrow.py | 14 ++++++++++++++ 3 files changed, 25 insertions(+) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 09932a2d2d571..f6b0b4086cb39 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -22,6 +22,7 @@ Bug fixes ~~~~~~~~~ - Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) +- Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 55c6b74e495c0..a7f2ef85c2a9d 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -245,6 +245,16 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal Construct a new ExtensionArray from a sequence of scalars. """ pa_dtype = to_pyarrow_type(dtype) + if ( + isinstance(scalars, np.ndarray) + and isinstance(dtype, ArrowDtype) + and ( + pa.types.is_large_binary(pa_dtype) or pa.types.is_large_string(pa_dtype) + ) + ): + # See https://github.com/apache/arrow/issues/35289 + scalars = scalars.tolist() + if isinstance(scalars, cls): scalars = scalars._pa_array elif not isinstance(scalars, (pa.Array, pa.ChunkedArray)): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index e9a9dbbc8d965..4caa982fa7b64 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2802,6 +2802,20 @@ def test_setitem_boolean_replace_with_mask_segfault(): assert arr._pa_array == expected._pa_array +@pytest.mark.parametrize( + "data, arrow_dtype", + [ + ([b"a", b"b"], pa.large_binary()), + (["a", "b"], pa.large_string()), + ], +) +def test_conversion_large_dtypes_from_numpy_array(data, arrow_dtype): + dtype = ArrowDtype(arrow_dtype) + result = pd.array(np.array(data), dtype=dtype) + expected = pd.array(data, dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + @pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES + tm.FLOAT_PYARROW_DTYPES) def test_describe_numeric_data(pa_type): # GH 52470 From 6f8e79d96e3b9e20715ee5c6f27e625be09765c0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 27 Apr 2023 14:27:17 +0200 Subject: [PATCH 068/577] BUG: convert_dtypes ingoring convert keywords for pyarrow backend (#52872) --- 
doc/source/whatsnew/v2.0.2.rst | 1 + pandas/core/dtypes/cast.py | 37 ++++++++++++------- .../frame/methods/test_convert_dtypes.py | 14 +++++++ 3 files changed, 38 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index f6b0b4086cb39..adfebd857b390 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -22,6 +22,7 @@ Bug fixes ~~~~~~~~~ - Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) +- Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_*`` keywords when set to False ``dtype_backend="pyarrow"`` (:issue:`52872`) - Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`) - diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 2d45158cf2a9f..fd8c651fe73dc 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1106,20 +1106,29 @@ def convert_dtypes( from pandas.core.arrays.arrow.dtype import ArrowDtype from pandas.core.arrays.string_ import StringDtype - if isinstance(inferred_dtype, PandasExtensionDtype): - base_dtype = inferred_dtype.base - elif isinstance(inferred_dtype, (BaseMaskedDtype, ArrowDtype)): - base_dtype = inferred_dtype.numpy_dtype - elif isinstance(inferred_dtype, StringDtype): - base_dtype = np.dtype(str) - else: - # error: Incompatible types in assignment (expression has type - # "Union[str, Any, dtype[Any], ExtensionDtype]", - # variable has type "Union[dtype[Any], ExtensionDtype, None]") - base_dtype = inferred_dtype # type: ignore[assignment] - pa_type = to_pyarrow_type(base_dtype) - if pa_type is not None: - inferred_dtype = ArrowDtype(pa_type) + assert not isinstance(inferred_dtype, str) + + if ( + (convert_integer and inferred_dtype.kind in "iu") + or 
(convert_floating and inferred_dtype.kind in "fc") + or (convert_boolean and inferred_dtype.kind == "b") + or (convert_string and isinstance(inferred_dtype, StringDtype)) + or ( + inferred_dtype.kind not in "iufcb" + and not isinstance(inferred_dtype, StringDtype) + ) + ): + if isinstance(inferred_dtype, PandasExtensionDtype): + base_dtype = inferred_dtype.base + elif isinstance(inferred_dtype, (BaseMaskedDtype, ArrowDtype)): + base_dtype = inferred_dtype.numpy_dtype + elif isinstance(inferred_dtype, StringDtype): + base_dtype = np.dtype(str) + else: + base_dtype = inferred_dtype + pa_type = to_pyarrow_type(base_dtype) + if pa_type is not None: + inferred_dtype = ArrowDtype(pa_type) # error: Incompatible return value type (got "Union[str, Union[dtype[Any], # ExtensionDtype]]", expected "Union[dtype[Any], ExtensionDtype]") diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 6076933eecec4..a749cd11df4f7 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -134,3 +134,17 @@ def test_pyarrow_engine_lines_false(self): ) with pytest.raises(ValueError, match=msg): df.convert_dtypes(dtype_backend="numpy") + + def test_pyarrow_backend_no_convesion(self): + # GH#52872 + pytest.importorskip("pyarrow") + df = pd.DataFrame({"a": [1, 2], "b": 1.5, "c": True, "d": "x"}) + expected = df.copy() + result = df.convert_dtypes( + convert_floating=False, + convert_integer=False, + convert_boolean=False, + convert_string=False, + dtype_backend="pyarrow", + ) + tm.assert_frame_equal(result, expected) From 8be69a4dc2db97728e89d812ca2d68ce3495f1f7 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 27 Apr 2023 13:31:44 +0100 Subject: [PATCH 069/577] CLN: Dont upcast where unnecessary (PDEP6 precursor) (#52957) --- doc/source/user_guide/cookbook.rst | 2 +- doc/source/user_guide/missing_data.rst | 
2 +- doc/source/whatsnew/v0.15.0.rst | 2 +- doc/source/whatsnew/v0.17.0.rst | 2 +- pandas/_testing/contexts.py | 24 +++++++++++++++--------- pandas/core/generic.py | 8 ++++---- pandas/tests/groupby/test_groupby.py | 3 ++- pandas/tests/groupby/test_nunique.py | 1 + 8 files changed, 26 insertions(+), 18 deletions(-) diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 3eee4ce7ac25c..604a5cc56ab04 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -125,7 +125,7 @@ Building criteria .. ipython:: python - df.loc[(df["BBB"] > 25) | (df["CCC"] >= 75), "AAA"] = 0.1 + df.loc[(df["BBB"] > 25) | (df["CCC"] >= 75), "AAA"] = 999 df `Select rows with data closest to certain value using argsort diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 4d645cd75ac76..a17d0eba294b2 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -123,7 +123,7 @@ the missing value type chosen: .. ipython:: python - s = pd.Series([1, 2, 3]) + s = pd.Series([1., 2., 3.]) s.loc[0] = None s diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index 67e91751e9527..6b962cbb49c74 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -748,7 +748,7 @@ Other notable API changes: .. ipython:: python - s = pd.Series([1, 2, 3]) + s = pd.Series([1., 2., 3.]) s.loc[0] = None s diff --git a/doc/source/whatsnew/v0.17.0.rst b/doc/source/whatsnew/v0.17.0.rst index 7067407604d24..abbda2ffc9be2 100644 --- a/doc/source/whatsnew/v0.17.0.rst +++ b/doc/source/whatsnew/v0.17.0.rst @@ -738,7 +738,7 @@ Boolean comparisons of a ``Series`` vs ``None`` will now be equivalent to compar .. 
ipython:: python - s = pd.Series(range(3)) + s = pd.Series(range(3), dtype="float") s.iloc[1] = None s diff --git a/pandas/_testing/contexts.py b/pandas/_testing/contexts.py index fb5b7b967f6bf..ab00c80886794 100644 --- a/pandas/_testing/contexts.py +++ b/pandas/_testing/contexts.py @@ -205,18 +205,24 @@ def use_numexpr(use, min_elements=None) -> Generator[None, None, None]: set_option("compute.use_numexpr", olduse) -def raises_chained_assignment_error(): - if PYPY: +def raises_chained_assignment_error(extra_warnings=(), extra_match=()): + from pandas._testing import assert_produces_warning + + if PYPY and not extra_warnings: from contextlib import nullcontext return nullcontext() + elif PYPY and extra_warnings: + return assert_produces_warning( + extra_warnings, + match="|".join(extra_match), + ) else: - from pandas._testing import assert_produces_warning - + match = ( + "A value is trying to be set on a copy of a DataFrame or Series " + "through chained assignment" + ) return assert_produces_warning( - ChainedAssignmentError, - match=( - "A value is trying to be set on a copy of a DataFrame or Series " - "through chained assignment" - ), + (ChainedAssignmentError, *extra_warnings), + match="|".join((match, *extra_match)), ) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2ee1e0512de74..6b3be257a1b30 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7894,7 +7894,7 @@ def asof(self, where, subset=None): Take all columns into consideration - >>> df = pd.DataFrame({'a': [10, 20, 30, 40, 50], + >>> df = pd.DataFrame({'a': [10., 20., 30., 40., 50.], ... 'b': [None, None, None, None, 500]}, ... index=pd.DatetimeIndex(['2018-02-27 09:01:00', ... '2018-02-27 09:02:00', @@ -7912,9 +7912,9 @@ def asof(self, where, subset=None): >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30', ... '2018-02-27 09:04:30']), ... 
subset=['a']) - a b - 2018-02-27 09:03:30 30 NaN - 2018-02-27 09:04:30 40 NaN + a b + 2018-02-27 09:03:30 30.0 NaN + 2018-02-27 09:04:30 40.0 NaN """ if isinstance(where, str): where = Timestamp(where) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 514c0fe82ff5f..42e3db2c72f26 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -184,7 +184,8 @@ def f_2(grp): msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("A").apply(f_2)[["B"]] - e = expected.copy() + # Explicit cast to float to avoid implicit cast when setting nan + e = expected.copy().astype({"B": "float"}) e.loc["Pony"] = np.nan tm.assert_frame_equal(result, e) diff --git a/pandas/tests/groupby/test_nunique.py b/pandas/tests/groupby/test_nunique.py index 282c91c82f5b1..661003d081bda 100644 --- a/pandas/tests/groupby/test_nunique.py +++ b/pandas/tests/groupby/test_nunique.py @@ -51,6 +51,7 @@ def check_nunique(df, keys, as_index=True): check_nunique(frame, ["jim"]) check_nunique(frame, ["jim", "joe"]) + frame = frame.astype({"julie": float}) # Explicit cast to avoid implicit cast below frame.loc[1::17, "jim"] = None frame.loc[3::37, "joe"] = None frame.loc[7::19, "julie"] = None From 12d323283ed72ce051438a4f30e081c10703802c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 27 Apr 2023 16:32:08 +0200 Subject: [PATCH 070/577] DOC: Clean up for deprecations (#52949) --- pandas/core/indexes/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 0f113fae81406..9fc2f89d3fb73 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2545,7 +2545,7 @@ def is_categorical(self) -> bool: Check if the Index holds categorical data. .. 
deprecated:: 2.0.0 - Use :meth:`pandas.api.types.is_categorical_dtype` instead. + Use `isinstance(index.dtype, pd.CategoricalDtype)` instead. Returns ------- @@ -2598,7 +2598,7 @@ def is_interval(self) -> bool: Check if the Index holds Interval objects. .. deprecated:: 2.0.0 - Use `pandas.api.types.is_interval_dtype` instead. + Use `isinstance(index.dtype, pd.IntervalDtype)` instead. Returns ------- From 66e460bf262cf6ce0f6417dbd306106e471d9727 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 27 Apr 2023 08:54:30 -0700 Subject: [PATCH 071/577] ENH: Support ArrowDtype in interchange Column.dtype (#52792) --- pandas/core/interchange/column.py | 7 +++- pandas/core/interchange/utils.py | 48 +++++++++++++++++++++++++ pandas/tests/interchange/test_utils.py | 49 ++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 1 deletion(-) diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index 7eb43dbd074c9..fea96d861f12c 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -11,6 +11,7 @@ import pandas as pd from pandas.api.types import is_string_dtype +from pandas.core.arrays.arrow.dtype import ArrowDtype from pandas.core.interchange.buffer import PandasBuffer from pandas.core.interchange.dataframe_protocol import ( Column, @@ -134,8 +135,12 @@ def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]: if kind is None: # Not a NumPy dtype. 
Check if it's a categorical maybe raise ValueError(f"Data type {dtype} not supported by interchange protocol") + if isinstance(dtype, ArrowDtype): + byteorder = dtype.numpy_dtype.byteorder + else: + byteorder = dtype.byteorder - return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), dtype.byteorder + return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), byteorder @property def describe_categorical(self): diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py index 69c0367238d7a..e92899583176f 100644 --- a/pandas/core/interchange/utils.py +++ b/pandas/core/interchange/utils.py @@ -13,10 +13,47 @@ from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.arrays.arrow.dtype import ArrowDtype + if typing.TYPE_CHECKING: from pandas._typing import DtypeObj +# Maps str(pyarrow.DataType) = C type format string +# Currently, no pyarrow API for this +PYARROW_CTYPES = { + "null": "n", + "bool": "b", + "uint8": "C", + "uint16": "S", + "uint32": "I", + "uint64": "L", + "int8": "c", + "int16": "s", + "int32": "i", + "int64": "l", + "halffloat": "e", # float16 + "float": "f", # float32 + "double": "g", # float64 + "string": "u", + "binary": "z", + "time32[s]": "tts", + "time32[ms]": "ttm", + "time64[us]": "ttu", + "time64[ns]": "ttn", + "date32[day]": "tdD", + "date64[ms]": "tdm", + "timestamp[s]": "tss:", + "timestamp[ms]": "tsm:", + "timestamp[us]": "tsu:", + "timestamp[ns]": "tsn:", + "duration[s]": "tDs", + "duration[ms]": "tDm", + "duration[us]": "tDu", + "duration[ns]": "tDn", +} + + class ArrowCTypes: """ Enum for Apache Arrow C type format strings.
@@ -78,6 +115,17 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str: return ArrowCTypes.INT64 elif dtype == np.dtype("O"): return ArrowCTypes.STRING + elif isinstance(dtype, ArrowDtype): + import pyarrow as pa + + pa_type = dtype.pyarrow_dtype + if pa.types.is_decimal(pa_type): + return f"d:{pa_type.precision},{pa_type.scale}" + elif pa.types.is_timestamp(pa_type) and pa_type.tz is not None: + return f"ts{pa_type.unit[0]}:{pa_type.tz}" + format_str = PYARROW_CTYPES.get(str(pa_type), None) + if format_str is not None: + return format_str format_str = getattr(ArrowCTypes, dtype.name.upper(), None) if format_str is not None: diff --git a/pandas/tests/interchange/test_utils.py b/pandas/tests/interchange/test_utils.py index 4fd42abb7f3f1..a47bc2752ff32 100644 --- a/pandas/tests/interchange/test_utils.py +++ b/pandas/tests/interchange/test_utils.py @@ -38,3 +38,52 @@ def test_dtype_to_arrow_c_fmt(pandas_dtype, c_string): # PR01 """Test ``dtype_to_arrow_c_fmt`` utility function.""" assert dtype_to_arrow_c_fmt(pandas_dtype) == c_string + + +@pytest.mark.parametrize( + "pa_dtype, args_kwargs, c_string", + [ + ["null", {}, "n"], + ["bool_", {}, "b"], + ["uint8", {}, "C"], + ["uint16", {}, "S"], + ["uint32", {}, "I"], + ["uint64", {}, "L"], + ["int8", {}, "c"], + ["int16", {}, "s"], + ["int32", {}, "i"], + ["int64", {}, "l"], + ["float16", {}, "e"], + ["float32", {}, "f"], + ["float64", {}, "g"], + ["string", {}, "u"], + ["binary", {}, "z"], + ["time32", ("s",), "tts"], + ["time32", ("ms",), "ttm"], + ["time64", ("us",), "ttu"], + ["time64", ("ns",), "ttn"], + ["date32", {}, "tdD"], + ["date64", {}, "tdm"], + ["timestamp", {"unit": "s"}, "tss:"], + ["timestamp", {"unit": "ms"}, "tsm:"], + ["timestamp", {"unit": "us"}, "tsu:"], + ["timestamp", {"unit": "ns"}, "tsn:"], + ["timestamp", {"unit": "ns", "tz": "UTC"}, "tsn:UTC"], + ["duration", ("s",), "tDs"], + ["duration", ("ms",), "tDm"], + ["duration", ("us",), "tDu"], + ["duration", ("ns",), "tDn"], + ["decimal128",
{"precision": 4, "scale": 2}, "d:4,2"], + ], +) +def test_dtype_to_arrow_c_fmt_arrowdtype(pa_dtype, args_kwargs, c_string): + # GH 52323 + pa = pytest.importorskip("pyarrow") + if not args_kwargs: + pa_type = getattr(pa, pa_dtype)() + elif isinstance(args_kwargs, tuple): + pa_type = getattr(pa, pa_dtype)(*args_kwargs) + else: + pa_type = getattr(pa, pa_dtype)(**args_kwargs) + arrow_type = pd.ArrowDtype(pa_type) + assert dtype_to_arrow_c_fmt(arrow_type) == c_string From 3587e29111b8752572e0357d77ca0930c38479ea Mon Sep 17 00:00:00 2001 From: Nicklaus Roach <106932628+nicklausroach@users.noreply.github.com> Date: Thu, 27 Apr 2023 12:03:59 -0400 Subject: [PATCH 072/577] BUG: overriden methods of subclasses of Styler are not called during rendering #52728 (#52919) * use type() instead of __class__ * add test * remove unnecessary file * typo * revert change to class method --------- Co-authored-by: Nicklaus Roach Co-authored-by: Nicklaus Roach --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/io/formats/style.py | 4 ++-- pandas/tests/io/formats/style/test_style.py | 14 ++++++++++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index b10dd876050ae..07c972e176257 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -419,7 +419,7 @@ ExtensionArray Styler ^^^^^^ -- +- Bug in :meth:`Styler._copy` calling overridden methods in subclasses of :class:`Styler` (:issue:`52728`) - Other diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 181b623cd52a6..185c3f34a8f23 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1579,8 +1579,8 @@ def _copy(self, deepcopy: bool = False) -> Styler: - applied styles (_todo) """ - # GH 40675 - styler = Styler( + # GH 40675, 52728 + styler = type(self)( self.data, # populates attributes 'data', 'columns', 'index' as shallow ) shallow = [ # simple string or boolean immutables diff --git 
a/pandas/tests/io/formats/style/test_style.py b/pandas/tests/io/formats/style/test_style.py index 0abe4b82e8848..f1eb6ee3c5c62 100644 --- a/pandas/tests/io/formats/style/test_style.py +++ b/pandas/tests/io/formats/style/test_style.py @@ -313,6 +313,20 @@ def test_copy(comprehensive, render, deepcopy, mi_styler, mi_styler_comp): assert id(getattr(s2, attr)) != id(getattr(styler, attr)) +@pytest.mark.parametrize("deepcopy", [True, False]) +def test_inherited_copy(mi_styler, deepcopy): + # Ensure that the inherited class is preserved when a Styler object is copied. + # GH 52728 + class CustomStyler(Styler): + pass + + custom_styler = CustomStyler(mi_styler.data) + custom_styler_copy = ( + copy.deepcopy(custom_styler) if deepcopy else copy.copy(custom_styler) + ) + assert isinstance(custom_styler_copy, CustomStyler) + + def test_clear(mi_styler_comp): # NOTE: if this test fails for new features then 'mi_styler_comp' should be updated # to ensure proper testing of the 'copy', 'clear', 'export' methods with new feature From e1df262e2b20b961c51bf217df9b67c0d1317e69 Mon Sep 17 00:00:00 2001 From: liang3zy22 <35164941+liang3zy22@users.noreply.github.com> Date: Fri, 28 Apr 2023 00:05:32 +0800 Subject: [PATCH 073/577] resampler with non_naive index test GH#25411 (#52956) Signed-off-by: Liang Yan --- .../tests/resample/test_resampler_grouper.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index f19b315400f69..209f2e74d97de 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -327,6 +327,25 @@ def test_apply_columns_multilevel(): tm.assert_frame_equal(result, expected) +def test_apply_non_naive_index(): + def weighted_quantile(series, weights, q): + series = series.sort_values() + cumsum = weights.reindex(series.index).fillna(0).cumsum() + cutoff = cumsum.iloc[-1] * q + return series[cumsum 
>= cutoff].iloc[0] + + times = date_range("2017-6-23 18:00", periods=8, freq="15T", tz="UTC") + data = Series([1.0, 1, 1, 1, 1, 2, 2, 0], index=times) + weights = Series([160.0, 91, 65, 43, 24, 10, 1, 0], index=times) + + result = data.resample("D").apply(weighted_quantile, weights=weights, q=0.5) + ind = date_range( + "2017-06-23 00:00:00+00:00", "2017-06-23 00:00:00+00:00", freq="D", tz="UTC" + ) + expected = Series([1.0], index=ind) + tm.assert_series_equal(result, expected) + + def test_resample_groupby_with_label(): # GH 13235 index = date_range("2000-01-01", freq="2D", periods=5) From 9c19fc248a0c04dfc7ec0789fea0ecc889381543 Mon Sep 17 00:00:00 2001 From: JHM Darbyshire <24256554+attack68@users.noreply.github.com> Date: Thu, 27 Apr 2023 18:10:20 +0200 Subject: [PATCH 074/577] REF: Styler.applymap -> map (#52708) * REF: Styler.applymap -> map * update tests to new names with deprecation warnings * whats new * excel tests fixed * Fix the UserGuide style.ipynb * Additional applymap change * add whatsnew back after merge --------- Co-authored-by: JHM Darbyshire (iMac) --- doc/source/reference/style.rst | 4 +- doc/source/user_guide/style.ipynb | 53 ++++--- doc/source/whatsnew/v2.1.0.rst | 2 + pandas/core/generic.py | 2 +- pandas/io/formats/style.py | 131 +++++++++++++----- pandas/io/formats/style_render.py | 4 +- pandas/tests/io/excel/test_openpyxl.py | 2 +- pandas/tests/io/excel/test_style.py | 16 +-- pandas/tests/io/formats/style/test_html.py | 26 ++-- .../tests/io/formats/style/test_non_unique.py | 6 +- pandas/tests/io/formats/style/test_style.py | 32 ++--- .../tests/io/formats/style/test_to_latex.py | 12 +- 12 files changed, 178 insertions(+), 112 deletions(-) diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst index 5144f12fa373a..2256876c93e01 100644 --- a/doc/source/reference/style.rst +++ b/doc/source/reference/style.rst @@ -36,9 +36,9 @@ Style application :toctree: api/ Styler.apply - Styler.applymap + Styler.map 
Styler.apply_index - Styler.applymap_index + Styler.map_index Styler.format Styler.format_index Styler.relabel_index diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 7ae19dfe8021e..79b04ef57d9cf 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -352,15 +351,15 @@ "\n", "- Using [.set_table_styles()][table] to control broader areas of the table with specified internal CSS. Although table styles allow the flexibility to add CSS selectors and properties controlling all individual parts of the table, they are unwieldy for individual cell specifications. Also, note that table styles cannot be exported to Excel. \n", "- Using [.set_td_classes()][td_class] to directly link either external CSS classes to your data cells or link the internal CSS classes created by [.set_table_styles()][table]. See [here](#Setting-Classes-and-Linking-to-External-CSS). These cannot be used on column header rows or indexes, and also won't export to Excel. \n", - "- Using the [.apply()][apply] and [.applymap()][applymap] functions to add direct internal CSS to specific data cells. See [here](#Styler-Functions). As of v1.4.0 there are also methods that work directly on column header rows or indexes; [.apply_index()][applyindex] and [.applymap_index()][applymapindex]. Note that only these methods add styles that will export to Excel. These methods work in a similar way to [DataFrame.apply()][dfapply] and [DataFrame.map()][dfmap].\n", + "- Using the [.apply()][apply] and [.map()][map] functions to add direct internal CSS to specific data cells. See [here](#Styler-Functions). As of v1.4.0 there are also methods that work directly on column header rows or indexes; [.apply_index()][applyindex] and [.map_index()][mapindex]. Note that only these methods add styles that will export to Excel. 
These methods work in a similar way to [DataFrame.apply()][dfapply] and [DataFrame.map()][dfmap].\n", "\n", "[table]: ../reference/api/pandas.io.formats.style.Styler.set_table_styles.rst\n", "[styler]: ../reference/api/pandas.io.formats.style.Styler.rst\n", "[td_class]: ../reference/api/pandas.io.formats.style.Styler.set_td_classes.rst\n", "[apply]: ../reference/api/pandas.io.formats.style.Styler.apply.rst\n", - "[applymap]: ../reference/api/pandas.io.formats.style.Styler.applymap.rst\n", + "[map]: ../reference/api/pandas.io.formats.style.Styler.map.rst\n", "[applyindex]: ../reference/api/pandas.io.formats.style.Styler.apply_index.rst\n", - "[applymapindex]: ../reference/api/pandas.io.formats.style.Styler.applymap_index.rst\n", + "[mapindex]: ../reference/api/pandas.io.formats.style.Styler.map_index.rst\n", "[dfapply]: ../reference/api/pandas.DataFrame.apply.rst\n", "[dfmap]: ../reference/api/pandas.DataFrame.map.rst" ] @@ -565,13 +564,13 @@ "\n", "We use the following methods to pass your style functions. Both of those methods take a function (and some other keyword arguments) and apply it to the DataFrame in a certain way, rendering CSS styles.\n", "\n", - "- [.applymap()][applymap] (elementwise): accepts a function that takes a single value and returns a string with the CSS attribute-value pair.\n", + "- [.map()][map] (elementwise): accepts a function that takes a single value and returns a string with the CSS attribute-value pair.\n", "- [.apply()][apply] (column-/row-/table-wise): accepts a function that takes a Series or DataFrame and returns a Series, DataFrame, or numpy array with an identical shape where each element is a string with a CSS attribute-value pair. This method passes each column or row of your DataFrame one-at-a-time or the entire table at once, depending on the `axis` keyword argument. 
For columnwise use `axis=0`, rowwise use `axis=1`, and for the entire table at once use `axis=None`.\n", "\n", "This method is powerful for applying multiple, complex logic to data cells. We create a new DataFrame to demonstrate this.\n", "\n", "[apply]: ../reference/api/pandas.io.formats.style.Styler.apply.rst\n", - "[applymap]: ../reference/api/pandas.io.formats.style.Styler.applymap.rst" + "[map]: ../reference/api/pandas.io.formats.style.Styler.map.rst" ] }, { @@ -589,7 +588,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "For example we can build a function that colors text if it is negative, and chain this with a function that partially fades cells of negligible value. Since this looks at each element in turn we use ``applymap``." + "For example we can build a function that colors text if it is negative, and chain this with a function that partially fades cells of negligible value. Since this looks at each element in turn we use ``map``." ] }, { @@ -600,8 +599,8 @@ "source": [ "def style_negative(v, props=''):\n", " return props if v < 0 else None\n", - "s2 = df2.style.applymap(style_negative, props='color:red;')\\\n", - " .applymap(lambda v: 'opacity: 20%;' if (v < 0.3) and (v > -0.3) else None)\n", + "s2 = df2.style.map(style_negative, props='color:red;')\\\n", + " .map(lambda v: 'opacity: 20%;' if (v < 0.3) and (v > -0.3) else None)\n", "s2" ] }, @@ -699,13 +698,13 @@ "\n", "Similar application is achieved for headers by using:\n", " \n", - "- [.applymap_index()][applymapindex] (elementwise): accepts a function that takes a single value and returns a string with the CSS attribute-value pair.\n", + "- [.map_index()][mapindex] (elementwise): accepts a function that takes a single value and returns a string with the CSS attribute-value pair.\n", "- [.apply_index()][applyindex] (level-wise): accepts a function that takes a Series and returns a Series, or numpy array with an identical shape where each element is a string with a CSS attribute-value 
pair. This method passes each level of your Index one-at-a-time. To style the index use `axis=0` and to style the column headers use `axis=1`.\n", "\n", "You can select a `level` of a `MultiIndex` but currently no similar `subset` application is available for these methods.\n", "\n", "[applyindex]: ../reference/api/pandas.io.formats.style.Styler.apply_index.rst\n", - "[applymapindex]: ../reference/api/pandas.io.formats.style.Styler.applymap_index.rst" + "[mapindex]: ../reference/api/pandas.io.formats.style.Styler.map_index.rst" ] }, { @@ -714,7 +713,7 @@ "metadata": {}, "outputs": [], "source": [ - "s2.applymap_index(lambda v: \"color:pink;\" if v>4 else \"color:darkblue;\", axis=0)\n", + "s2.map_index(lambda v: \"color:pink;\" if v>4 else \"color:darkblue;\", axis=0)\n", "s2.apply_index(lambda s: np.where(s.isin([\"A\", \"B\"]), \"color:pink;\", \"color:darkblue;\"), axis=1)" ] }, @@ -831,7 +830,7 @@ "source": [ "## Finer Control with Slicing\n", "\n", - "The examples we have shown so far for the `Styler.apply` and `Styler.applymap` functions have not demonstrated the use of the ``subset`` argument. This is a useful argument which permits a lot of flexibility: it allows you to apply styles to specific rows or columns, without having to code that logic into your `style` function.\n", + "The examples we have shown so far for the `Styler.apply` and `Styler.map` functions have not demonstrated the use of the ``subset`` argument. 
This is a useful argument which permits a lot of flexibility: it allows you to apply styles to specific rows or columns, without having to code that logic into your `style` function.\n", "\n", "The value passed to `subset` behaves similar to slicing a DataFrame;\n", "\n", @@ -1034,7 +1033,7 @@ "outputs": [], "source": [ "props = 'font-family: \"Times New Roman\", Times, serif; color: #e83e8c; font-size:1.3em;'\n", - "df4.style.applymap(lambda x: props, subset=[1])" + "df4.style.map(lambda x: props, subset=[1])" ] }, { @@ -1321,7 +1320,7 @@ "source": [ "### Set properties\n", "\n", - "Use `Styler.set_properties` when the style doesn't actually depend on the values. This is just a simple wrapper for `.applymap` where the function returns the same properties for all cells." + "Use `Styler.set_properties` when the style doesn't actually depend on the values. This is just a simple wrapper for `.map` where the function returns the same properties for all cells." ] }, { @@ -1465,8 +1464,8 @@ "outputs": [], "source": [ "style1 = df2.style\\\n", - " .applymap(style_negative, props='color:red;')\\\n", - " .applymap(lambda v: 'opacity: 20%;' if (v < 0.3) and (v > -0.3) else None)\\\n", + " .map(style_negative, props='color:red;')\\\n", + " .map(lambda v: 'opacity: 20%;' if (v < 0.3) and (v > -0.3) else None)\\\n", " .set_table_styles([{\"selector\": \"th\", \"props\": \"color: blue;\"}])\\\n", " .hide(axis=\"index\")\n", "style1" @@ -1683,7 +1682,7 @@ " - `number-format`\n", " - `border-style` (for Excel-specific styles: \"hair\", \"mediumDashDot\", \"dashDotDot\", \"mediumDashDotDot\", \"dashDot\", \"slantDashDot\", or \"mediumDashed\")\n", "\n", - "Table level styles, and data cell CSS-classes are not included in the export to Excel: individual cells must have their properties mapped by the `Styler.apply` and/or `Styler.applymap` methods." 
+ "Table level styles, and data cell CSS-classes are not included in the export to Excel: individual cells must have their properties mapped by the `Styler.apply` and/or `Styler.map` methods." ] }, { @@ -1693,7 +1692,7 @@ "outputs": [], "source": [ "df2.style.\\\n", - " applymap(style_negative, props='color:red;').\\\n", + " map(style_negative, props='color:red;').\\\n", " highlight_max(axis=0).\\\n", " to_excel('styled.xlsx', engine='openpyxl')" ] @@ -1783,8 +1782,8 @@ "outputs": [], "source": [ "df4 = pd.DataFrame([['text']])\n", - "df4.style.applymap(lambda x: 'color:green;')\\\n", - " .applymap(lambda x: 'color:red;')" + "df4.style.map(lambda x: 'color:green;')\\\n", + " .map(lambda x: 'color:red;')" ] }, { @@ -1793,8 +1792,8 @@ "metadata": {}, "outputs": [], "source": [ - "df4.style.applymap(lambda x: 'color:red;')\\\n", - " .applymap(lambda x: 'color:green;')" + "df4.style.map(lambda x: 'color:red;')\\\n", + " .map(lambda x: 'color:green;')" ] }, { @@ -1821,7 +1820,7 @@ "source": [ "df4.style.set_uuid('a_')\\\n", " .set_table_styles([{'selector': 'td', 'props': 'color:red;'}])\\\n", - " .applymap(lambda x: 'color:green;')" + " .map(lambda x: 'color:green;')" ] }, { @@ -1840,7 +1839,7 @@ "df4.style.set_uuid('b_')\\\n", " .set_table_styles([{'selector': 'td', 'props': 'color:red;'},\n", " {'selector': '.cls-1', 'props': 'color:blue;'}])\\\n", - " .applymap(lambda x: 'color:green;')\\\n", + " .map(lambda x: 'color:green;')\\\n", " .set_td_classes(pd.DataFrame([['cls-1']]))" ] }, @@ -1861,7 +1860,7 @@ " .set_table_styles([{'selector': 'td', 'props': 'color:red;'},\n", " {'selector': '.cls-1', 'props': 'color:blue;'},\n", " {'selector': 'td.data', 'props': 'color:yellow;'}])\\\n", - " .applymap(lambda x: 'color:green;')\\\n", + " .map(lambda x: 'color:green;')\\\n", " .set_td_classes(pd.DataFrame([['cls-1']]))" ] }, @@ -1884,7 +1883,7 @@ " .set_table_styles([{'selector': 'td', 'props': 'color:red;'},\n", " {'selector': '.cls-1', 'props': 'color:blue;'},\n", " 
{'selector': 'td.data', 'props': 'color:yellow;'}])\\\n", - " .applymap(lambda x: 'color:green !important;')\\\n", + " .map(lambda x: 'color:green !important;')\\\n", " .set_td_classes(pd.DataFrame([['cls-1']]))" ] }, diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 07c972e176257..7846c48c655a9 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -241,6 +241,8 @@ Deprecations - Deprecated :func:`is_interval_dtype`, check ``isinstance(dtype, pd.IntervalDtype)`` instead (:issue:`52607`) - Deprecated :func:`is_period_dtype`, check ``isinstance(dtype, pd.PeriodDtype)`` instead (:issue:`52642`) - Deprecated :func:`is_sparse`, check ``isinstance(dtype, pd.SparseDtype)`` instead (:issue:`52642`) +- Deprecated :meth:`.Styler.applymap_index`. Use the new :meth:`.Styler.map_index` method instead (:issue:`52708`) +- Deprecated :meth:`.Styler.applymap`. Use the new :meth:`.Styler.map` method instead (:issue:`52708`) - Deprecated :meth:`DataFrame.applymap`. 
Use the new :meth:`DataFrame.map` method instead (:issue:`52353`) - Deprecated :meth:`DataFrame.swapaxes` and :meth:`Series.swapaxes`, use :meth:`DataFrame.transpose` or :meth:`Series.transpose` instead (:issue:`51946`) - Deprecated ``freq`` parameter in :class:`PeriodArray` constructor, pass ``dtype`` instead (:issue:`52462`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6b3be257a1b30..60e8f39548a34 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3543,7 +3543,7 @@ def _to_latex_via_styler( # bold_rows is not a direct kwarg of Styler.to_latex render_kwargs = {} if render_kwargs is None else render_kwargs if render_kwargs.pop("bold_rows"): - styler.applymap_index(lambda v: "textbf:--rwrap;") + styler.map_index(lambda v: "textbf:--rwrap;") return styler.to_latex(buf=buf, **render_kwargs) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 185c3f34a8f23..b2505bc63926c 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -16,6 +16,7 @@ Sequence, overload, ) +import warnings import numpy as np @@ -26,6 +27,7 @@ Substitution, doc, ) +from pandas.util._exceptions import find_stack_level import pandas as pd from pandas import ( @@ -206,7 +208,7 @@ class Styler(StylerRenderer): Notes ----- Most styling will be done by passing style functions into - ``Styler.apply`` or ``Styler.applymap``. Style functions should + ``Styler.apply`` or ``Styler.map``. Style functions should return values with strings containing CSS ``'attr: value'`` that will be applied to the indicated cells. @@ -306,8 +308,8 @@ def concat(self, other: Styler) -> Styler: For example adding a sub total row, or displaying metrics such as means, variance or counts. 
- Styles that are applied using the ``apply``, ``applymap``, ``apply_index`` - and ``applymap_index``, and formatting applied with ``format`` and + Styles that are applied using the ``apply``, ``map``, ``apply_index`` + and ``map_index``, and formatting applied with ``format`` and ``format_index`` will be preserved. .. warning:: @@ -354,7 +356,7 @@ def concat(self, other: Styler) -> Styler: >>> other = (descriptors.style ... .highlight_max(axis=1, subset=(["Total", "Average"], slice(None))) ... .format(subset=("Average", slice(None)), precision=2, decimal=",") - ... .applymap(lambda v: "font-weight: bold;")) + ... .map(lambda v: "font-weight: bold;")) >>> styler = (df.style ... .highlight_max(color="salmon") ... .set_table_styles([{"selector": ".foot_row0", @@ -991,7 +993,7 @@ def to_latex( ... else: color = "#ffdd33" ... return f"color: {color}; font-weight: bold;" >>> (styler.background_gradient(cmap="inferno", subset="Equity", vmin=0, vmax=1) - ... .applymap(rating_color, subset="Rating")) # doctest: +SKIP + ... .map(rating_color, subset="Rating")) # doctest: +SKIP All the above styles will work with HTML (see below) and LaTeX upon conversion: @@ -1003,7 +1005,7 @@ def to_latex( as well as `--rwrap` to ensure this is formatted correctly and not ignored upon conversion. - >>> styler.applymap_index( + >>> styler.map_index( ... lambda v: "rotatebox:{45}--rwrap--latex;", level=2, axis=1 ... ) # doctest: +SKIP @@ -1514,7 +1516,7 @@ def _update_ctx(self, attrs: DataFrame) -> None: """ if not self.index.is_unique or not self.columns.is_unique: raise KeyError( - "`Styler.apply` and `.applymap` are not compatible " + "`Styler.apply` and `.map` are not compatible " "with non-unique index or columns." ) @@ -1745,9 +1747,9 @@ def apply( See Also -------- - Styler.applymap_index: Apply a CSS-styling function to headers elementwise. + Styler.map_index: Apply a CSS-styling function to headers elementwise. Styler.apply_index: Apply a CSS-styling function to headers level-wise. 
- Styler.applymap: Apply a CSS-styling function elementwise. + Styler.map: Apply a CSS-styling function elementwise. Notes ----- @@ -1813,7 +1815,7 @@ def _apply_index( if method == "apply": result = data.apply(func, axis=0, **kwargs) - elif method == "applymap": + elif method == "map": result = data.map(func, **kwargs) self._update_ctx_header(result, axis) @@ -1822,7 +1824,7 @@ def _apply_index( @doc( this="apply", wise="level-wise", - alt="applymap", + alt="map", altwise="elementwise", func="take a Series and return a string array of the same length", input_note="the index as a Series, if an Index, or a level of a MultiIndex", @@ -1845,6 +1847,9 @@ def apply_index( .. versionadded:: 1.4.0 + .. versionadded:: 2.1.0 + Styler.applymap_index was deprecated and renamed to Styler.map_index. + Parameters ---------- func : function @@ -1864,7 +1869,7 @@ def apply_index( -------- Styler.{alt}_index: Apply a CSS-styling function to headers {altwise}. Styler.apply: Apply a CSS-styling function column-wise, row-wise, or table-wise. - Styler.applymap: Apply a CSS-styling function elementwise. + Styler.map: Apply a CSS-styling function elementwise. 
Notes ----- @@ -1905,7 +1910,7 @@ def apply_index( @doc( apply_index, - this="applymap", + this="map", wise="elementwise", alt="apply", altwise="level-wise", @@ -1916,7 +1921,7 @@ def apply_index( ret='"background-color: yellow;" if v == "B" else None', ret2='"background-color: yellow;" if "x" in v else None', ) - def applymap_index( + def map_index( self, func: Callable, axis: AxisInt | str = 0, @@ -1926,16 +1931,50 @@ def applymap_index( self._todo.append( ( lambda instance: getattr(instance, "_apply_index"), - (func, axis, level, "applymap"), + (func, axis, level, "map"), kwargs, ) ) return self - def _applymap( - self, func: Callable, subset: Subset | None = None, **kwargs + def applymap_index( + self, + func: Callable, + axis: AxisInt | str = 0, + level: Level | list[Level] | None = None, + **kwargs, ) -> Styler: - func = partial(func, **kwargs) # applymap doesn't take kwargs? + """ + Apply a CSS-styling function to the index or column headers, elementwise. + + .. deprecated:: 2.1.0 + + Styler.applymap_index has been deprecated. Use Styler.map_index instead. + + Parameters + ---------- + func : function + ``func`` should take a scalar and return a string. + axis : {{0, 1, "index", "columns"}} + The headers over which to apply the function. + level : int, str, list, optional + If index is MultiIndex the level(s) over which to apply the function. + **kwargs : dict + Pass along to ``func``. + + Returns + ------- + Styler + """ + warnings.warn( + "Styler.applymap_index has been deprecated. Use Styler.map_index instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self.map_index(func, axis, level, **kwargs) + + def _map(self, func: Callable, subset: Subset | None = None, **kwargs) -> Styler: + func = partial(func, **kwargs) # map doesn't take kwargs? 
if subset is None: subset = IndexSlice[:] subset = non_reducing_slice(subset) @@ -1944,9 +1983,7 @@ def _applymap( return self @Substitution(subset=subset_args) - def applymap( - self, func: Callable, subset: Subset | None = None, **kwargs - ) -> Styler: + def map(self, func: Callable, subset: Subset | None = None, **kwargs) -> Styler: """ Apply a CSS-styling function elementwise. @@ -1966,7 +2003,7 @@ def applymap( See Also -------- - Styler.applymap_index: Apply a CSS-styling function to headers elementwise. + Styler.map_index: Apply a CSS-styling function to headers elementwise. Styler.apply_index: Apply a CSS-styling function to headers level-wise. Styler.apply: Apply a CSS-styling function column-wise, row-wise, or table-wise. @@ -1981,30 +2018,60 @@ def applymap( >>> def color_negative(v, color): ... return f"color: {color};" if v < 0 else None >>> df = pd.DataFrame(np.random.randn(5, 2), columns=["A", "B"]) - >>> df.style.applymap(color_negative, color='red') # doctest: +SKIP + >>> df.style.map(color_negative, color='red') # doctest: +SKIP Using ``subset`` to restrict application to a single column or multiple columns - >>> df.style.applymap(color_negative, color='red', subset="A") + >>> df.style.map(color_negative, color='red', subset="A") ... # doctest: +SKIP - >>> df.style.applymap(color_negative, color='red', subset=["A", "B"]) + >>> df.style.map(color_negative, color='red', subset=["A", "B"]) ... # doctest: +SKIP Using a 2d input to ``subset`` to select rows in addition to columns - >>> df.style.applymap(color_negative, color='red', + >>> df.style.map(color_negative, color='red', ... subset=([0,1,2], slice(None))) # doctest: +SKIP - >>> df.style.applymap(color_negative, color='red', subset=(slice(0,5,2), "A")) + >>> df.style.map(color_negative, color='red', subset=(slice(0,5,2), "A")) ... # doctest: +SKIP See `Table Visualization <../../user_guide/style.ipynb>`_ user guide for more details. 
""" self._todo.append( - (lambda instance: getattr(instance, "_applymap"), (func, subset), kwargs) + (lambda instance: getattr(instance, "_map"), (func, subset), kwargs) ) return self + @Substitution(subset=subset_args) + def applymap( + self, func: Callable, subset: Subset | None = None, **kwargs + ) -> Styler: + """ + Apply a CSS-styling function elementwise. + + .. deprecated:: 2.1.0 + + Styler.applymap has been deprecated. Use Styler.map instead. + + Parameters + ---------- + func : function + ``func`` should take a scalar and return a string. + %(subset)s + **kwargs : dict + Pass along to ``func``. + + Returns + ------- + Styler + """ + warnings.warn( + "Styler.applymap has been deprecated. Use Styler.map instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self.map(func, subset, **kwargs) + def set_table_attributes(self, attributes: str) -> Styler: """ Set the table attributes added to the ``<table>`` HTML element. @@ -2058,7 +2125,7 @@ def export(self) -> dict[str, Any]: The following items are exported since they are not generally data dependent: - - Styling functions added by the ``apply`` and ``applymap`` + - Styling functions added by the ``apply`` and ``map`` - Whether axes and names are hidden from the display, if unambiguous. - Table attributes - Table styles @@ -2104,7 +2171,7 @@ def use(self, styles: dict[str, Any]) -> Styler: styles : dict(str, Any) List of attributes to add to Styler. Dict keys should contain only: - "apply": list of styler functions, typically added with ``apply`` or - ``applymap``. + ``map``. - "table_attributes": HTML attributes, typically added with ``set_table_attributes``.
- "table_styles": CSS selectors and properties, typically added with @@ -2893,7 +2960,7 @@ def set_properties(self, subset: Subset | None = None, **kwargs) -> Styler: Notes ----- - This is a convenience methods which wraps the :meth:`Styler.applymap` calling a + This is a convenience methods which wraps the :meth:`Styler.map` calling a function returning the CSS-properties independently of the data. Examples @@ -2906,7 +2973,7 @@ def set_properties(self, subset: Subset | None = None, **kwargs) -> Styler: more details. """ values = "".join([f"{p}: {v};" for p, v in kwargs.items()]) - return self.applymap(lambda x: values, subset=subset) + return self.map(lambda x: values, subset=subset) @Substitution(subset=subset_args) def bar( # pylint: disable=disallowed-name diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 5b608089945a2..3f4273648b4d2 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -247,7 +247,7 @@ def _compute(self): Execute the style functions built up in `self._todo`. Relies on the conventions that all style functions go through - .apply or .applymap. The append styles to apply as tuples of + .apply or .map. The append styles to apply as tuples of (application method, *args, **kwargs) """ @@ -1168,7 +1168,7 @@ def format( >>> df = pd.DataFrame({"A": [1, 0, -1]}) >>> pseudo_css = "number-format: 0§[Red](0)§-§@;" >>> filename = "formatted_file.xlsx" - >>> df.style.applymap(lambda v: pseudo_css).to_excel(filename) # doctest: +SKIP + >>> df.style.map(lambda v: pseudo_css).to_excel(filename) # doctest: +SKIP .. 
figure:: ../../_static/style/format_excel_css.png """ diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index d28296ec2e380..b8d41164792e0 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -254,7 +254,7 @@ def test_to_excel_with_openpyxl_engine(ext): df1 = DataFrame({"A": np.linspace(1, 10, 10)}) df2 = DataFrame({"B": np.linspace(1, 20, 10)}) df = pd.concat([df1, df2], axis=1) - styled = df.style.applymap( + styled = df.style.map( lambda val: f"color: {'red' if val < 0 else 'black'}" ).highlight_max() diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index b26d59b9bdebb..0220fceb6347f 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -131,7 +131,7 @@ def test_styler_to_excel_unstyled(engine): def test_styler_to_excel_basic(engine, css, attrs, expected): pytest.importorskip(engine) df = DataFrame(np.random.randn(1, 1)) - styler = df.style.applymap(lambda x: css) + styler = df.style.map(lambda x: css) with tm.ensure_clean(".xlsx") as path: with ExcelWriter(path, engine=engine) as writer: @@ -164,13 +164,13 @@ def test_styler_to_excel_basic_indexes(engine, css, attrs, expected): df = DataFrame(np.random.randn(1, 1)) styler = df.style - styler.applymap_index(lambda x: css, axis=0) - styler.applymap_index(lambda x: css, axis=1) + styler.map_index(lambda x: css, axis=0) + styler.map_index(lambda x: css, axis=1) null_styler = df.style - null_styler.applymap(lambda x: "null: css;") - null_styler.applymap_index(lambda x: "null: css;", axis=0) - null_styler.applymap_index(lambda x: "null: css;", axis=1) + null_styler.map(lambda x: "null: css;") + null_styler.map_index(lambda x: "null: css;", axis=0) + null_styler.map_index(lambda x: "null: css;", axis=1) with tm.ensure_clean(".xlsx") as path: with ExcelWriter(path, engine=engine) as writer: @@ -231,7 +231,7 @@ def 
test_styler_to_excel_border_style(engine, border_style): pytest.importorskip(engine) df = DataFrame(np.random.randn(1, 1)) - styler = df.style.applymap(lambda x: css) + styler = df.style.map(lambda x: css) with tm.ensure_clean(".xlsx") as path: with ExcelWriter(path, engine=engine) as writer: @@ -261,7 +261,7 @@ def custom_converter(css): return {"font": {"color": {"rgb": "111222"}}} df = DataFrame(np.random.randn(1, 1)) - styler = df.style.applymap(lambda x: "color: #888999") + styler = df.style.map(lambda x: "color: #888999") with tm.ensure_clean(".xlsx") as path: with ExcelWriter(path, engine="openpyxl") as writer: ExcelFormatter(styler, style_converter=custom_converter).write( diff --git a/pandas/tests/io/formats/style/test_html.py b/pandas/tests/io/formats/style/test_html.py index 1867260fbc4b4..67f7e12fcc3c2 100644 --- a/pandas/tests/io/formats/style/test_html.py +++ b/pandas/tests/io/formats/style/test_html.py @@ -85,11 +85,9 @@ def test_exclude_styles(styler): def test_w3_html_format(styler): - styler.set_uuid("").set_table_styles( - [{"selector": "th", "props": "att2:v2;"}] - ).applymap(lambda x: "att1:v1;").set_table_attributes( - 'class="my-cls1" style="attr3:v3;"' - ).set_td_classes( + styler.set_uuid("").set_table_styles([{"selector": "th", "props": "att2:v2;"}]).map( + lambda x: "att1:v1;" + ).set_table_attributes('class="my-cls1" style="attr3:v3;"').set_td_classes( DataFrame(["my-cls2"], index=["a"], columns=["A"]) ).format( "{:.1f}" @@ -428,14 +426,14 @@ def test_sparse_options(sparse_index, sparse_columns): @pytest.mark.parametrize("index", [True, False]) @pytest.mark.parametrize("columns", [True, False]) -def test_applymap_header_cell_ids(styler, index, columns): +def test_map_header_cell_ids(styler, index, columns): # GH 41893 func = lambda v: "attr: val;" styler.uuid, styler.cell_ids = "", False if index: - styler.applymap_index(func, axis="index") + styler.map_index(func, axis="index") if columns: - styler.applymap_index(func, axis="columns") + 
styler.map_index(func, axis="columns") result = styler.to_html() @@ -493,9 +491,9 @@ def test_replaced_css_class_names(): styler_mi.index.names = ["n1", "n2"] styler_mi.hide(styler_mi.index[1:], axis=0) styler_mi.hide(styler_mi.columns[1:], axis=1) - styler_mi.applymap_index(lambda v: "color: red;", axis=0) - styler_mi.applymap_index(lambda v: "color: green;", axis=1) - styler_mi.applymap(lambda v: "color: blue;") + styler_mi.map_index(lambda v: "color: red;", axis=0) + styler_mi.map_index(lambda v: "color: green;", axis=1) + styler_mi.map(lambda v: "color: blue;") expected = dedent( """\