Skip to content
223 changes: 134 additions & 89 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,26 @@
from pandas.util import _test_decorators as td


@pytest.fixture(
    params=[np.int32, np.int64, np.float32, np.float64],
    ids=["np.int32", "np.int64", "np.float32", "np.float64"],
)
def numpy_dtypes_for_minmax(request):
    """
    Fixture pairing each parametrized numpy dtype with its extreme values.

    Returns a ``(dtype, min_val, max_val)`` tuple, where the bounds are the
    smallest and largest values representable by ``dtype``. Used for
    testing cummin and cummax.
    """
    dtype = request.param
    # Kind "i" (signed integer) dtypes are described by np.iinfo; every other
    # dtype parametrized here goes through np.finfo.
    describe = np.iinfo if np.dtype(dtype).kind == "i" else np.finfo
    limits = describe(dtype)

    return dtype, limits.min, limits.max


@pytest.mark.parametrize("agg_func", ["any", "all"])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize(
Expand Down Expand Up @@ -174,11 +194,10 @@ def test_arg_passthru():
)

for attr in ["mean", "median"]:
f = getattr(df.groupby("group"), attr)
result = f()
result = getattr(df.groupby("group"), attr)()
tm.assert_index_equal(result.columns, expected_columns_numeric)

result = f(numeric_only=False)
result = getattr(df.groupby("group"), attr)(numeric_only=False)
tm.assert_frame_equal(result.reindex_like(expected), expected)

# TODO: min, max *should* handle
Expand All @@ -195,11 +214,10 @@ def test_arg_passthru():
]
)
for attr in ["min", "max"]:
f = getattr(df.groupby("group"), attr)
result = f()
result = getattr(df.groupby("group"), attr)()
tm.assert_index_equal(result.columns, expected_columns)

result = f(numeric_only=False)
result = getattr(df.groupby("group"), attr)(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)

expected_columns = Index(
Expand All @@ -215,52 +233,47 @@ def test_arg_passthru():
]
)
for attr in ["first", "last"]:
f = getattr(df.groupby("group"), attr)
result = f()
result = getattr(df.groupby("group"), attr)()
tm.assert_index_equal(result.columns, expected_columns)

result = f(numeric_only=False)
result = getattr(df.groupby("group"), attr)(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)

expected_columns = Index(["int", "float", "string", "category_int", "timedelta"])
for attr in ["sum"]:
f = getattr(df.groupby("group"), attr)
result = f()
tm.assert_index_equal(result.columns, expected_columns_numeric)

result = f(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)
result = df.groupby("group").sum()
tm.assert_index_equal(result.columns, expected_columns_numeric)

result = df.groupby("group").sum(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)

expected_columns = Index(["int", "float", "category_int"])
for attr in ["prod", "cumprod"]:
f = getattr(df.groupby("group"), attr)
result = f()
result = getattr(df.groupby("group"), attr)()
tm.assert_index_equal(result.columns, expected_columns_numeric)

result = f(numeric_only=False)
result = getattr(df.groupby("group"), attr)(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)

# like min, max, but don't include strings
expected_columns = Index(
["int", "float", "category_int", "datetime", "datetimetz", "timedelta"]
)
for attr in ["cummin", "cummax"]:
f = getattr(df.groupby("group"), attr)
result = f()
result = getattr(df.groupby("group"), attr)()
# GH 15561: numeric_only=False set by default like min/max
tm.assert_index_equal(result.columns, expected_columns)

result = f(numeric_only=False)
result = getattr(df.groupby("group"), attr)(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)

expected_columns = Index(["int", "float", "category_int", "timedelta"])
for attr in ["cumsum"]:
f = getattr(df.groupby("group"), attr)
result = f()
tm.assert_index_equal(result.columns, expected_columns_numeric)

result = f(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)
result = getattr(df.groupby("group"), "cumsum")()
tm.assert_index_equal(result.columns, expected_columns_numeric)

result = getattr(df.groupby("group"), "cumsum")(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)


def test_non_cython_api():
Expand Down Expand Up @@ -691,59 +704,31 @@ def test_numpy_compat(func):
reason="https://github.com/pandas-dev/pandas/issues/31992",
strict=False,
)
def test_cummin_cummax():
def test_cummin(numpy_dtypes_for_minmax):
dtype = numpy_dtypes_for_minmax[0]
min_val = numpy_dtypes_for_minmax[1]

# GH 15048
num_types = [np.int32, np.int64, np.float32, np.float64]
num_mins = [
np.iinfo(np.int32).min,
np.iinfo(np.int64).min,
np.finfo(np.float32).min,
np.finfo(np.float64).min,
]
num_max = [
np.iinfo(np.int32).max,
np.iinfo(np.int64).max,
np.finfo(np.float32).max,
np.finfo(np.float64).max,
]
base_df = pd.DataFrame(
{"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}
)
expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]

for dtype, min_val, max_val in zip(num_types, num_mins, num_max):
df = base_df.astype(dtype)
df = base_df.astype(dtype)

# cummin
expected = pd.DataFrame({"B": expected_mins}).astype(dtype)
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
result = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)

# Test cummin w/ min value for dtype
df.loc[[2, 6], "B"] = min_val
expected.loc[[2, 3, 6, 7], "B"] = min_val
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)

# cummax
expected = pd.DataFrame({"B": expected_maxs}).astype(dtype)
result = df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
result = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)
expected = pd.DataFrame({"B": expected_mins}).astype(dtype)
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
result = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)

# Test cummax w/ max value for dtype
df.loc[[2, 6], "B"] = max_val
expected.loc[[2, 3, 6, 7], "B"] = max_val
result = df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
expected = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)
# Test w/ min value for dtype
df.loc[[2, 6], "B"] = min_val
expected.loc[[2, 3, 6, 7], "B"] = min_val
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)

# Test nan in some values
base_df.loc[[0, 2, 4, 6], "B"] = np.nan
Expand All @@ -753,41 +738,101 @@ def test_cummin_cummax():
expected = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)

expected = pd.DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
result = base_df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
expected = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)
# GH 15561
df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(["2001"])))
expected = pd.Series(pd.to_datetime("2001"), index=[0], name="b")

result = df.groupby("a")["b"].cummin()
tm.assert_series_equal(expected, result)

# GH 15635
df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2]))
result = df.groupby("a").b.cummin()
expected = pd.Series([1, 2, 1], name="b")
tm.assert_series_equal(result, expected)


@pytest.mark.xfail(
    _is_numpy_dev,
    reason="https://github.com/pandas-dev/pandas/issues/31992",
    strict=False,
)
def test_cummin_all_nan_column():
    """
    Check that groupby ``cummin`` over an all-NaN column yields all NaN.

    The direct ``groupby(...).cummin()`` result and the per-group
    ``apply(lambda x: x.cummin())`` spelling are each compared against the
    same all-NaN expected frame.
    """
    # Column "B" is built as all-NaN directly in the constructor.
    base_df = pd.DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8})

    expected = pd.DataFrame({"B": [np.nan] * 8})
    result = base_df.groupby("A").cummin()
    tm.assert_frame_equal(expected, result)
    result = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
    tm.assert_frame_equal(expected, result)


@pytest.mark.xfail(
    _is_numpy_dev,
    reason="https://github.com/pandas-dev/pandas/issues/31992",
    strict=False,
)
def test_cummax(numpy_dtypes_for_minmax):
    """
    Check groupby ``cummax`` for each dtype from ``numpy_dtypes_for_minmax``.

    Sections cover, in order: plain values cast to the dtype, rows set to the
    dtype's maximum value, a column with NaN in some rows, a datetime column
    (GH 15561) and interleaved groups (GH 15635). Only ``cummax`` is
    exercised here.
    """
    # The fixture yields (dtype, min_val, max_val); the minimum is unused here.
    dtype, _, max_val = numpy_dtypes_for_minmax

    # GH 15048
    base_df = pd.DataFrame(
        {"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}
    )
    expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]

    df = base_df.astype(dtype)

    expected = pd.DataFrame({"B": expected_maxs}).astype(dtype)
    result = df.groupby("A").cummax()
    tm.assert_frame_equal(result, expected)
    result = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
    tm.assert_frame_equal(result, expected)

    # Test w/ max value for dtype
    df.loc[[2, 6], "B"] = max_val
    expected.loc[[2, 3, 6, 7], "B"] = max_val
    result = df.groupby("A").cummax()
    tm.assert_frame_equal(result, expected)
    # Cross-check the groupby result against the per-group apply spelling.
    expected = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
    tm.assert_frame_equal(result, expected)

    # Test nan in some values
    base_df.loc[[0, 2, 4, 6], "B"] = np.nan
    expected = pd.DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
    result = base_df.groupby("A").cummax()
    tm.assert_frame_equal(result, expected)
    # `result` stays the groupby cummax output so this compares the two
    # code paths rather than the apply output with itself.
    expected = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
    tm.assert_frame_equal(result, expected)

    # GH 15561
    df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(["2001"])))
    expected = pd.Series(pd.to_datetime("2001"), index=[0], name="b")

    result = df.groupby("a")["b"].cummax()
    tm.assert_series_equal(expected, result)

    # GH 15635
    df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1]))
    result = df.groupby("a").b.cummax()
    expected = pd.Series([2, 1, 2], name="b")
    tm.assert_series_equal(result, expected)

@pytest.mark.xfail(
    _is_numpy_dev,
    reason="https://github.com/pandas-dev/pandas/issues/31992",
    strict=False,
)
def test_cummax_all_nan_column():
    """
    Check that groupby ``cummax`` over an all-NaN column yields all NaN.

    Both the direct ``cummax`` call and the per-group ``apply`` spelling are
    compared against the same all-NaN expected frame.
    """
    nan_column = [np.nan] * 8
    base_df = pd.DataFrame({"A": [1] * 4 + [2] * 4, "B": nan_column})
    expected = pd.DataFrame({"B": nan_column})

    # Each check builds its own groupby object, as the original did.
    via_cummax = base_df.groupby("A").cummax()
    tm.assert_frame_equal(expected, via_cummax)

    via_apply = base_df.groupby("A").B.apply(lambda grp: grp.cummax()).to_frame()
    tm.assert_frame_equal(expected, via_apply)


@pytest.mark.parametrize(
Expand Down