From bdf34c6aed73ce4c8825e7927ca3dee4f0fceca6 Mon Sep 17 00:00:00 2001 From: Shoham Debnath Date: Sun, 5 Sep 2021 04:59:29 +0530 Subject: [PATCH 1/4] Backport PR #43150: BUG: GroupBy.quantile fails with pd.NA --- doc/source/whatsnew/v1.3.3.rst | 1 + pandas/core/groupby/groupby.py | 4 +++ pandas/tests/groupby/test_quantile.py | 39 +++++++++++++++++++++++++++ 3 files changed, 44 insertions(+) diff --git a/doc/source/whatsnew/v1.3.3.rst b/doc/source/whatsnew/v1.3.3.rst index 00409cf963ab3..46080e1847317 100644 --- a/doc/source/whatsnew/v1.3.3.rst +++ b/doc/source/whatsnew/v1.3.3.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed regression in :class:`DataFrame` constructor failing to broadcast for defined :class:`Index` and len one list of :class:`Timestamp` (:issue:`42810`) - Performance regression in :meth:`core.window.ewm.ExponentialMovingWindow.mean` (:issue:`42333`) - Fixed regression in :meth:`.GroupBy.agg` incorrectly raising in some cases (:issue:`42390`) +- Fixed regression in :meth:`.GroupBy.quantile` which was failing with ``pandas.NA`` (:issue:`42849`) - Fixed regression in :meth:`merge` where ``on`` columns with ``ExtensionDtype`` or ``bool`` data types were cast to ``object`` in ``right`` and ``outer`` merge (:issue:`40073`) - Fixed regression in :meth:`RangeIndex.where` and :meth:`RangeIndex.putmask` raising ``AssertionError`` when result did not represent a :class:`RangeIndex` (:issue:`43240`) - Fixed regression in :meth:`read_parquet` where the ``fastparquet`` engine would not work properly with fastparquet 0.7.0 (:issue:`43075`) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5a70db517ad12..7a22d83dfe24b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -63,6 +63,7 @@ class providing the base-class of operations. from pandas.core.dtypes.common import ( is_bool_dtype, is_datetime64_dtype, + is_float_dtype, is_integer_dtype, is_numeric_dtype, is_object_dtype, @@ -2438,6 +2439,9 @@ def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, np.dtype | None]: elif is_timedelta64_dtype(vals.dtype): inference = np.dtype("timedelta64[ns]") out = np.asarray(vals).astype(float) + elif isinstance(vals, ExtensionArray) and is_float_dtype(vals): + inference = np.dtype(np.float64) + out = vals.to_numpy(dtype=float, na_value=np.nan) else: out = np.asarray(vals) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 90437b9139594..83d6c20bcac24 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -248,6 +248,45 @@ def test_groupby_quantile_skips_invalid_dtype(q): tm.assert_frame_equal(result, expected) +def test_groupby_quantile_NA_float(any_float_dtype): + # GH#42849 + df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype) + result = df.groupby("x")["y"].quantile(0.5) + expected = pd.Series([0.2], dtype=float, index=[1.0], name="y") + expected.index.name = "x" + tm.assert_series_equal(expected, result) + + result = df.groupby("x")["y"].quantile([0.5, 0.75]) + expected = pd.Series( + [0.2] * 2, + index=pd.MultiIndex.from_product(([1.0], [0.5, 0.75]), names=["x", None]), + name="y", + ) + tm.assert_series_equal(result, expected) + + +def test_groupby_quantile_NA_int(any_int_ea_dtype): + # GH#42849 + df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype) + result = df.groupby("x")["y"].quantile(0.5) + expected = pd.Series([3.5], dtype=float, index=Index([1], name="x"), name="y") + tm.assert_series_equal(expected, result) + + result = df.groupby("x").quantile(0.5) + expected = DataFrame({"y": 3.5}, index=Index([1], name="x")) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["Float64", "Float32"]) +def test_groupby_quantile_allNA_column(dtype): + # GH#42849 + df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype) + result = df.groupby("x")["y"].quantile(0.5) + expected = pd.Series([np.nan], dtype=float, index=[1.0], name="y") + expected.index.name = "x" + tm.assert_series_equal(expected, result) + + def test_groupby_timedelta_quantile(): # GH: 29485 df = DataFrame( From 6a54ac4a0a76a6128f535dd430c1ab9985f86cd7 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sun, 5 Sep 2021 19:35:20 +0530 Subject: [PATCH 2/4] used 1.3.x fixtures in test --- pandas/tests/groupby/test_quantile.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 83d6c20bcac24..3a1d40c47b096 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -248,9 +248,11 @@ def test_groupby_quantile_skips_invalid_dtype(q): tm.assert_frame_equal(result, expected) -def test_groupby_quantile_NA_float(any_float_dtype): +def test_groupby_quantile_NA_float(any_float_allowed_nullable_dtype): # GH#42849 - df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype) + df = DataFrame( + {"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_allowed_nullable_dtype + ) result = df.groupby("x")["y"].quantile(0.5) expected = pd.Series([0.2], dtype=float, index=[1.0], name="y") expected.index.name = "x" @@ -265,9 +267,9 @@ def test_groupby_quantile_NA_float(any_float_dtype): tm.assert_series_equal(result, expected) -def test_groupby_quantile_NA_int(any_int_ea_dtype): +def test_groupby_quantile_NA_int(any_nullable_int_dtype): # GH#42849 - df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype) + df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_nullable_int_dtype) result = df.groupby("x")["y"].quantile(0.5) expected = pd.Series([3.5], dtype=float, index=Index([1], name="x"), name="y") tm.assert_series_equal(expected, result) From 17b373d4bd23f06e5601189cbef90dc30b578369 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Mon, 6 Sep 2021 21:39:01 +0530 Subject: [PATCH 3/4] changed float with object --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7a22d83dfe24b..e9fe35bc04c2c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2440,7 +2440,7 @@ def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, np.dtype | None]: inference = np.dtype("timedelta64[ns]") out = np.asarray(vals).astype(float) elif isinstance(vals, ExtensionArray) and is_float_dtype(vals): - inference = np.dtype(np.float64) + inference = np.dtype(object) out = vals.to_numpy(dtype=float, na_value=np.nan) else: out = np.asarray(vals) From a55b74bb1adeea4ef11602d26397a153ccb4caf0 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Thu, 9 Sep 2021 12:48:04 +0530 Subject: [PATCH 4/4] suggested change --- pandas/core/groupby/groupby.py | 2 +- pandas/tests/groupby/test_quantile.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e9fe35bc04c2c..7a22d83dfe24b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2440,7 +2440,7 @@ def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, np.dtype | None]: inference = np.dtype("timedelta64[ns]") out = np.asarray(vals).astype(float) elif isinstance(vals, ExtensionArray) and is_float_dtype(vals): - inference = np.dtype(object) + inference = np.dtype(np.float64) out = vals.to_numpy(dtype=float, na_value=np.nan) else: out = np.asarray(vals) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 3a1d40c47b096..ebcc31226b895 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -254,14 +254,15 @@ def test_groupby_quantile_NA_float(any_float_allowed_nullable_dtype): {"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_allowed_nullable_dtype ) result = df.groupby("x")["y"].quantile(0.5) - expected = pd.Series([0.2], dtype=float, index=[1.0], name="y") - expected.index.name = "x" + expected = pd.Series([0.2], dtype=float, index=Index(df["x"][:1]), name="y") tm.assert_series_equal(expected, result) result = df.groupby("x")["y"].quantile([0.5, 0.75]) expected = pd.Series( [0.2] * 2, - index=pd.MultiIndex.from_product(([1.0], [0.5, 0.75]), names=["x", None]), + index=pd.MultiIndex.from_arrays( + [Index(df["x"]), [0.5, 0.75]], names=["x", None] + ), name="y", ) tm.assert_series_equal(result, expected) @@ -271,11 +272,11 @@ def test_groupby_quantile_NA_int(any_nullable_int_dtype): # GH#42849 df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_nullable_int_dtype) result = df.groupby("x")["y"].quantile(0.5) - expected = pd.Series([3.5], dtype=float, index=Index([1], name="x"), name="y") + expected = pd.Series([3.5], dtype=float, index=Index(df["x"][:1]), name="y") tm.assert_series_equal(expected, result) result = df.groupby("x").quantile(0.5) - expected = DataFrame({"y": 3.5}, index=Index([1], name="x")) + expected = DataFrame({"y": 3.5}, index=Index(df["x"][:1])) tm.assert_frame_equal(result, expected) @@ -284,8 +285,7 @@ def test_groupby_quantile_allNA_column(dtype): # GH#42849 df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype) result = df.groupby("x")["y"].quantile(0.5) - expected = pd.Series([np.nan], dtype=float, index=[1.0], name="y") - expected.index.name = "x" + expected = pd.Series([np.nan], dtype=float, index=Index(df["x"][:1]), name="y") tm.assert_series_equal(expected, result)