-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: Min/max does not work for dates with timezones if there are missing values in the data frame #44222
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BUG: Min/max does not work for dates with timezones if there are missing values in the data frame #44222
Changes from all commits
f952745
c6e5078
3a88582
abb2249
330a76b
6b17d3d
3e6c88f
bc64980
145f768
4b06297
f096c6f
a29363d
da67a2d
fe6c5ff
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,7 +17,9 @@ | |
DataFrame, | ||
Index, | ||
MultiIndex, | ||
PeriodDtype, | ||
Series, | ||
Timedelta, | ||
Timestamp, | ||
date_range, | ||
isna, | ||
|
@@ -756,7 +758,7 @@ def test_operators_timedelta64(self): | |
# excludes numeric | ||
with tm.assert_produces_warning(FutureWarning, match="Select only valid"): | ||
result = mixed.min(axis=1) | ||
expected = Series([1, 1, 1.0], index=[0, 1, 2]) | ||
expected = Series([]) | ||
tm.assert_series_equal(result, expected) | ||
|
||
# works when only those columns are selected | ||
|
@@ -1763,3 +1765,141 @@ def test_prod_sum_min_count_mixed_object(): | |
msg = re.escape("unsupported operand type(s) for +: 'int' and 'str'") | ||
with pytest.raises(TypeError, match=msg): | ||
df.sum(axis=0, min_count=1, numeric_only=False) | ||
|
||
|
||
def test_timezone_min_max_with_nat():
    """Row-wise min/max over tz-aware datetime columns must skip ``NaT``.

    Column ``B`` matches column ``A`` on the first two rows and is ``NaT``
    on the last one, so with missing values skipped both reductions are
    expected to return the values of column ``A``.
    """
    # GH#27794
    df = DataFrame(
        {
            "A": date_range(start="2018-01-01", end="2018-01-03", tz="UTC"),
            "B": date_range(start="2018-01-01", end="2018-01-02", tz="UTC").insert(
                2, pd.NaT
            ),
        }
    )

    # B is either equal to A or missing, hence min and max share one
    # expectation; it must stay tz-aware rather than degrade to object/NaN.
    expected = Series(
        [
            Timestamp("2018-01-01", tz="UTC"),
            Timestamp("2018-01-02", tz="UTC"),
            Timestamp("2018-01-03", tz="UTC"),
        ],
    )

    result = df.min(axis=1)
    tm.assert_series_equal(result, expected)

    result = df.max(axis=1)
    tm.assert_series_equal(result, expected)
|
||
|
||
def test_min_max_timestamp_timezone_nat():
    """Row-wise min and max on fixed-offset tz-aware columns with one ``NaT``.

    Column ``B`` is column ``A`` shifted by 20 minutes, with the value in
    row 2 replaced by ``NaT``. Skipping the missing value, the row-wise
    maximum is ``B`` everywhere except row 2 (where only ``A`` is left) and
    the row-wise minimum is always ``A``.
    """
    # GH#44196
    rng_with_tz = date_range(
        start="2021-10-01T12:00:00+02:00", end="2021-10-02T12:00:00+02:00", freq="4h"
    )
    df_with_tz = DataFrame(
        data={"A": rng_with_tz, "B": rng_with_tz + Timedelta(minutes=20)}
    )
    df_with_tz.iloc[2, 1] = pd.NaT

    result = df_with_tz.max(axis=1)
    expected = Series(
        [
            Timestamp("2021-10-01T12:20:00+02:00"),
            Timestamp("2021-10-01T16:20:00+02:00"),
            # B is NaT in this row, so the maximum falls back to A
            Timestamp("2021-10-01T20:00:00+02:00"),
            Timestamp("2021-10-02T00:20:00+02:00"),
            Timestamp("2021-10-02T04:20:00+02:00"),
            Timestamp("2021-10-02T08:20:00+02:00"),
            Timestamp("2021-10-02T12:20:00+02:00"),
        ]
    )
    tm.assert_series_equal(result, expected)

    # The test name covers min as well: A is always the smaller value and
    # is never missing, so the minimum reproduces column A.
    result = df_with_tz.min(axis=1)
    expected = Series(
        [
            Timestamp("2021-10-01T12:00:00+02:00"),
            Timestamp("2021-10-01T16:00:00+02:00"),
            Timestamp("2021-10-01T20:00:00+02:00"),
            Timestamp("2021-10-02T00:00:00+02:00"),
            Timestamp("2021-10-02T04:00:00+02:00"),
            Timestamp("2021-10-02T08:00:00+02:00"),
            Timestamp("2021-10-02T12:00:00+02:00"),
        ]
    )
    tm.assert_series_equal(result, expected)
|
||
|
||
def test_timezone_min_max_both_axis(): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The same bugs probably also affect timedelta64 and PeriodDtype? Can you test those too? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I added tests for these cases, but I am not sure whether I understood you correctly. This bug only appears for timezone-aware data; as far as I know, timedelta and PeriodDtype don't carry timezone information, so I don't expect the bug to exist there. Do you mean adding these types to a datetime column? |
||
rng_with_tz = pd.date_range( | ||
start="2021-10-01T12:00:00+02:00", end="2021-10-02T12:00:00+02:00", freq="4H" | ||
) | ||
df_with_tz = DataFrame( | ||
data={"A": rng_with_tz, "B": rng_with_tz + pd.Timedelta(minutes=20)} | ||
) | ||
df_with_tz.iloc[2, 1] = pd.NaT | ||
|
||
result = df_with_tz.max(axis=1) | ||
expected = df_with_tz.T.max(axis=0) | ||
|
||
tm.assert_series_equal(result, expected) | ||
|
||
result = df_with_tz.min(axis=1) | ||
expected = df_with_tz.T.min(axis=0) | ||
|
||
tm.assert_series_equal(result, expected) | ||
|
||
|
||
def test_min_max_timedelta64_nat():
    """min/max over timedelta data containing ``NaT``, along both axes.

    The frame has two rows and three columns; the last column holds a
    ``NaT`` in the second row. Each reduction is checked against explicit
    values and against the same reduction applied to the transposed frame
    along the other axis.
    """
    df = DataFrame(
        [
            [Timedelta(minutes=20), Timedelta(days=2), Timedelta(seconds=3)],
            [Timedelta(minutes=2, seconds=2), Timedelta(days=2, minutes=30), pd.NaT],
        ]
    )

    # column-wise minimum; the NaT in the last column is skipped
    expected = Series(
        [Timedelta(minutes=2, seconds=2), Timedelta(days=2), Timedelta(seconds=3)]
    )
    result = df.min(axis=0)
    tm.assert_series_equal(result, expected)
    tm.assert_series_equal(result, df.T.min(axis=1))

    # row-wise minimum
    expected = Series([Timedelta(seconds=3), Timedelta(minutes=2, seconds=2)])
    result = df.min(axis=1)
    tm.assert_series_equal(result, expected)
    tm.assert_series_equal(result, df.T.min(axis=0))

    # column-wise maximum; the NaT in the last column is skipped
    expected = Series(
        [Timedelta(minutes=20), Timedelta(days=2, minutes=30), Timedelta(seconds=3)]
    )
    result = df.max(axis=0)
    tm.assert_series_equal(result, expected)
    tm.assert_series_equal(result, df.T.max(axis=1))

    # row-wise maximum
    expected = Series([Timedelta(days=2), Timedelta(days=2, minutes=30)])
    result = df.max(axis=1)
    tm.assert_series_equal(result, expected)
    tm.assert_series_equal(result, df.T.max(axis=0))
|
||
|
||
def test_min_max_perioddtype_nat(): | ||
df = DataFrame( | ||
[ | ||
[PeriodDtype(freq="20m"), PeriodDtype(freq="1h"), PeriodDtype(freq="1d")], | ||
[PeriodDtype(freq="25m"), PeriodDtype(freq="2h"), pd.NaT], | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Try period_range, or even just take the frame from the dt64tz case and do a .to_period("D") on it |
||
] | ||
) | ||
|
||
expected = Series([]) | ||
result = df.min(axis=0) | ||
tm.assert_series_equal(result, expected) | ||
tm.assert_series_equal(df.min(axis=0), df.T.min(axis=1)) | ||
|
||
expected = Series([]) | ||
result = df.min(axis=1) | ||
tm.assert_series_equal(result, expected) | ||
tm.assert_series_equal(df.min(axis=1), df.T.min(axis=0)) | ||
|
||
expected = Series([]) | ||
result = df.max(axis=0) | ||
tm.assert_series_equal(result, expected) | ||
tm.assert_series_equal(df.max(axis=0), df.T.max(axis=1)) | ||
|
||
expected = Series([]) | ||
result = df.max(axis=1) | ||
tm.assert_series_equal(result, expected) | ||
tm.assert_series_equal(df.max(axis=1), df.T.max(axis=0)) |
Uh oh!
There was an error while loading. Please reload this page.