diff --git a/bigframes/core/resample.py b/bigframes/core/resample.py new file mode 100644 index 0000000000..5177dd69f3 --- /dev/null +++ b/bigframes/core/resample.py @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from bigframes.core.groupby import DataFrameGroupBy + + +class Resampler(DataFrameGroupBy): + def __init__(self, obj, by, **kwargs): + super().__init__(obj, by, **kwargs) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 9cb388329e..ce7c932049 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -3679,79 +3679,14 @@ def _split( blocks = self._block.split(ns=ns, fracs=fracs, random_state=random_state) return [DataFrame(block) for block in blocks] - @validations.requires_ordering() - def _resample( + def resample( self, rule: str, *, on: blocks.Label = None, - level: Optional[LevelsType] = None, - origin: Union[ - Union[ - pandas.Timestamp, datetime.datetime, numpy.datetime64, int, float, str - ], - Literal["epoch", "start", "start_day", "end", "end_day"], - ] = "start_day", + level: Optional[LevelType] = None, + origin: Literal["epoch", "start", "start_day"] = "start_day", ) -> bigframes.core.groupby.DataFrameGroupBy: - """Internal function to support resample. Resample time-series data. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None - - >>> data = { - ... "timestamp_col": pd.date_range( - ... 
start="2021-01-01 13:00:00", periods=30, freq="1s" - ... ), - ... "int64_col": range(30), - ... "int64_too": range(10, 40), - ... } - - Resample on a DataFrame with index: - - >>> df = bpd.DataFrame(data).set_index("timestamp_col") - >>> df._resample(rule="7s").min() - int64_col int64_too - 2021-01-01 12:59:55 0 10 - 2021-01-01 13:00:02 2 12 - 2021-01-01 13:00:09 9 19 - 2021-01-01 13:00:16 16 26 - 2021-01-01 13:00:23 23 33 - - [5 rows x 2 columns] - - Resample with column and origin set to 'start': - - >>> df = bpd.DataFrame(data) - >>> df._resample(rule="7s", on = "timestamp_col", origin="start").min() - int64_col int64_too - 2021-01-01 13:00:00 0 10 - 2021-01-01 13:00:07 7 17 - 2021-01-01 13:00:14 14 24 - 2021-01-01 13:00:21 21 31 - 2021-01-01 13:00:28 28 38 - - [5 rows x 2 columns] - - Args: - rule (str): - The offset string representing target conversion. - on (str, default None): - For a DataFrame, column to use instead of index for resampling. Column - must be datetime-like. - level (str or int, default None): - For a MultiIndex, level (name or number) to use for resampling. - level must be datetime-like. - origin(str, default 'start_day'): - The timestamp on which to adjust the grouping. Must be one of the following: - 'epoch': origin is 1970-01-01 - 'start': origin is the first value of the timeseries - 'start_day': origin is the first day at midnight of the timeseries - Returns: - DataFrameGroupBy: DataFrameGroupBy object. 
- """ block = self._block._generate_resample_label( rule=rule, on=on, @@ -3759,7 +3694,12 @@ def _resample( origin=origin, ) df = DataFrame(block) - return df.groupby(level=0) + return groupby.DataFrameGroupBy( + df._block, + by_col_ids=df._resolve_levels(0), + as_index=True, + dropna=True, + ) @classmethod def from_dict( diff --git a/bigframes/series.py b/bigframes/series.py index 87f1f1d141..e315c9a849 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -2172,65 +2172,16 @@ def explode(self, *, ignore_index: Optional[bool] = False) -> Series: ) ) - @validations.requires_ordering() - def _resample( + def resample( self, rule: str, *, - closed: Optional[Literal["right", "left"]] = None, - label: Optional[Literal["right", "left"]] = None, - level: Optional[LevelsType] = None, - origin: Union[ - Union[ - pandas.Timestamp, datetime.datetime, numpy.datetime64, int, float, str - ], - Literal["epoch", "start", "start_day", "end", "end_day"], - ] = "start_day", + on: blocks.Label = None, + level: Optional[LevelType] = None, + origin: Literal["epoch", "start", "start_day"] = "start_day", ) -> bigframes.core.groupby.SeriesGroupBy: - """Internal function to support resample. Resample time-series data. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import pandas as pd - >>> bpd.options.display.progress_bar = None - - >>> data = { - ... "timestamp_col": pd.date_range( - ... start="2021-01-01 13:00:00", periods=30, freq="1s" - ... ), - ... "int64_col": range(30), - ... } - >>> s = bpd.DataFrame(data).set_index("timestamp_col") - >>> s._resample(rule="7s", origin="epoch").min() - int64_col - 2021-01-01 12:59:56 0 - 2021-01-01 13:00:03 3 - 2021-01-01 13:00:10 10 - 2021-01-01 13:00:17 17 - 2021-01-01 13:00:24 24 - - [5 rows x 1 columns] - - - Args: - rule (str): - The offset string representing target conversion. - level (str or int, default None): - For a MultiIndex, level (name or number) to use for resampling. - level must be datetime-like. 
- origin(str, default 'start_day'): - The timestamp on which to adjust the grouping. Must be one of the following: - 'epoch': origin is 1970-01-01 - 'start': origin is the first value of the timeseries - 'start_day': origin is the first day at midnight of the timeseries - Returns: - SeriesGroupBy: SeriesGroupBy object. - """ block = self._block._generate_resample_label( rule=rule, - closed=closed, - label=label, on=None, level=level, origin=origin, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 362d736aeb..4e8f9e7072 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -5418,13 +5418,13 @@ def test_dataframe_explode_xfail(col_names): ), ], ) -def test__resample_with_column( +def test_resample_with_column( scalars_df_index, scalars_pandas_df_index, on, rule, origin ): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") bf_result = ( - scalars_df_index._resample(rule=rule, on=on, origin=origin)[ + scalars_df_index.resample(rule=rule, on=on, origin=origin)[ ["int64_col", "int64_too"] ] .max() @@ -5446,7 +5446,7 @@ def test__resample_with_column( pytest.param(False, None, "datetime_col", "100d"), ], ) -def test__resample_with_index( +def test_resample_with_index( scalars_df_index, scalars_pandas_df_index, append, level, col, rule ): # TODO: supply a reason why this isn't compatible with pandas 1.x @@ -5455,7 +5455,7 @@ def test__resample_with_index( scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append) bf_result = ( scalars_df_index[["int64_col", "int64_too"]] - ._resample(rule=rule, level=level) + .resample(rule=rule, level=level) .min() .to_pandas() ) @@ -5505,7 +5505,7 @@ def test__resample_with_index( ), ], ) -def test__resample_start_time(rule, origin, data): +def test_resample_start_time(rule, origin, data): # TODO: supply a reason why this isn't compatible with pandas 1.x 
pytest.importorskip("pandas", minversion="2.0.0") col = "timestamp_col" @@ -5513,7 +5513,7 @@ def test__resample_start_time(rule, origin, data): scalars_pandas_df_index = pd.DataFrame(data).set_index(col) scalars_pandas_df_index.index.name = None - bf_result = scalars_df_index._resample(rule=rule, origin=origin).min().to_pandas() + bf_result = scalars_df_index.resample(rule=rule, origin=origin).min().to_pandas() pd_result = scalars_pandas_df_index.resample(rule=rule, origin=origin).min() diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index c63bf8e12b..03eafbb83a 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -4398,14 +4398,14 @@ def test_series_explode_null(data): pytest.param(True, "timestamp_col", "timestamp_col", "1YE"), ], ) -def test__resample(scalars_df_index, scalars_pandas_df_index, append, level, col, rule): +def test_resample(scalars_df_index, scalars_pandas_df_index, append, level, col, rule): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") scalars_df_index = scalars_df_index.set_index(col, append=append)["int64_col"] scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append)[ "int64_col" ] - bf_result = scalars_df_index._resample(rule=rule, level=level).min().to_pandas() + bf_result = scalars_df_index.resample(rule=rule, level=level).min().to_pandas() pd_result = scalars_pandas_df_index.resample(rule=rule, level=level).min() pd.testing.assert_series_equal(bf_result, pd_result) diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index f6a56af7ff..a26c6f143a 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -250,7 +250,7 @@ def test_unordered_mode_no_ambiguity_warning(unordered_session): ), ], ) -def test__resample_with_index(unordered_session, rule, origin, data): +def 
test_resample_with_index(unordered_session, rule, origin, data): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") col = "timestamp_col" @@ -258,7 +258,7 @@ def test__resample_with_index(unordered_session, rule, origin, data): scalars_pandas_df_index = pd.DataFrame(data).set_index(col) scalars_pandas_df_index.index.name = None - bf_result = scalars_df_index._resample(rule=rule, origin=origin).min().to_pandas() + bf_result = scalars_df_index.resample(rule=rule, origin=origin).min().to_pandas() pd_result = scalars_pandas_df_index.resample(rule=rule, origin=origin).min() diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 4c9d1338f4..69939f4cfa 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -1,7 +1,7 @@ # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/generic.py from __future__ import annotations -from typing import Callable, Iterator, Literal, Optional, TYPE_CHECKING +from typing import Callable, Hashable, Iterator, Literal, Optional, TYPE_CHECKING import bigframes_vendored.constants as constants from bigframes_vendored.pandas.core import indexing @@ -1271,3 +1271,78 @@ def equals(self, other) -> bool: otherwise. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def resample( + self, + rule: str, + *, + on: Hashable = None, + level: Optional[Hashable] = None, + origin: Literal["epoch", "start", "start_day"] = "start_day", + ): + """ + Resample time-series data. + + Convenience method for frequency conversion and resampling of time + series. The object must have a datetime index or the caller must + pass the label of a datetime series/index to the on/level keyword + parameter. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import pandas as pd + >>> bpd.options.display.progress_bar = None + + >>> data = { + ... "timestamp_col": pd.date_range( + ... start="2021-01-01 13:00:00", periods=30, freq="1s" + ... ), + ... "int64_col": range(30), + ... "int64_too": range(10, 40), + ... } + + Resample on a DataFrame with index: + + >>> df = bpd.DataFrame(data).set_index("timestamp_col") + >>> df.resample(rule="7s").min() + int64_col int64_too + 2021-01-01 12:59:55 0 10 + 2021-01-01 13:00:02 2 12 + 2021-01-01 13:00:09 9 19 + 2021-01-01 13:00:16 16 26 + 2021-01-01 13:00:23 23 33 + + [5 rows x 2 columns] + + Resample with column and origin set to 'start': + + >>> df = bpd.DataFrame(data) + >>> df.resample(rule="7s", on="timestamp_col", origin="start").min() + int64_col int64_too + 2021-01-01 13:00:00 0 10 + 2021-01-01 13:00:07 7 17 + 2021-01-01 13:00:14 14 24 + 2021-01-01 13:00:21 21 31 + 2021-01-01 13:00:28 28 38 + + [5 rows x 2 columns] + + Args: + rule (str): + The offset string representing target conversion. + on (str, default None): + For a DataFrame, column to use instead of index for resampling. Column + must be datetime-like. + level (str or int, default None): + For a MultiIndex, level (name or number) to use for resampling. + level must be datetime-like. + origin (str, default 'start_day'): + The timestamp on which to adjust the grouping. Must be one of the following: + 'epoch': origin is 1970-01-01 + 'start': origin is the first value of the timeseries + 'start_day': origin is the first day at midnight of the timeseries + Returns: + DataFrameGroupBy or SeriesGroupBy: DataFrameGroupBy for a DataFrame, SeriesGroupBy for a Series. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)