From 710ad67950bfcced88d7383b5012f1a8196f6caf Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 22 Aug 2025 00:12:50 +0000 Subject: [PATCH] feat: Add reset_index names, col_level, col_fill, allow_duplicates args --- bigframes/core/blocks.py | 24 ++++++++-- bigframes/dataframe.py | 47 +++++++++++++++++-- bigframes/series.py | 8 +++- tests/system/small/test_dataframe.py | 26 ++++++++++ tests/system/small/test_multiindex.py | 22 +++++++-- tests/system/small/test_series.py | 26 ++++++++++ .../bigframes_vendored/pandas/core/frame.py | 17 +++++++ .../bigframes_vendored/pandas/core/series.py | 3 ++ 8 files changed, 162 insertions(+), 11 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index d2662da509..1a2544704c 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -387,12 +387,21 @@ def reversed(self) -> Block: index_labels=self.index.names, ) - def reset_index(self, level: LevelsType = None, drop: bool = True) -> Block: + def reset_index( + self, + level: LevelsType = None, + drop: bool = True, + *, + col_level: Union[str, int] = 0, + col_fill: typing.Hashable = "", + allow_duplicates: bool = False, + ) -> Block: """Reset the index of the block, promoting the old index to a value column. Arguments: level: the label or index level of the index levels to remove. 
name: this is the column id for the new value id derived from the old index + allow_duplicates: if True, skip the check that raises ValueError when a promoted index label already exists among the column labels Returns: A new Block because dropping index columns can break references @@ -438,6 +447,11 @@ def reset_index(self, level: LevelsType = None, drop: bool = True) -> Block: ) else: # Add index names to column index + col_level_n = ( + col_level + if isinstance(col_level, int) + else self.column_labels.names.index(col_level) + ) column_labels_modified = self.column_labels for position, level_id in enumerate(level_ids): label = self.col_id_to_index_name[level_id] @@ -447,11 +461,15 @@ def reset_index(self, level: LevelsType = None, drop: bool = True) -> Block: else: label = f"level_{self.index_columns.index(level_id)}" - if label in self.column_labels: + if (not allow_duplicates) and (label in self.column_labels): raise ValueError(f"cannot insert {label}, already exists") + if isinstance(self.column_labels, pd.MultiIndex): nlevels = self.column_labels.nlevels - label = tuple(label if i == 0 else "" for i in range(nlevels)) + label = tuple( + label if i == col_level_n else col_fill for i in range(nlevels) + ) + # Create index copy with label inserted # See: https://pandas.pydata.org/docs/reference/api/pandas.Index.insert.html column_labels_modified = column_labels_modified.insert(position, label) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index a76027fbd6..921893fb83 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2321,6 +2321,10 @@ def reset_index( level: blocks.LevelsType = ..., drop: bool = ..., inplace: Literal[False] = ..., + col_level: Union[int, str] = ..., + col_fill: Hashable = ..., + allow_duplicates: Optional[bool] = ..., + names: Union[None, Hashable, Sequence[Hashable]] = ..., ) -> DataFrame: ... 
@@ -2330,19 +2334,56 @@ def reset_index( level: blocks.LevelsType = ..., drop: bool = ..., inplace: Literal[True] = ..., + col_level: Union[int, str] = ..., + col_fill: Hashable = ..., + allow_duplicates: Optional[bool] = ..., + names: Union[None, Hashable, Sequence[Hashable]] = ..., ) -> None: ... @overload def reset_index( - self, level: blocks.LevelsType = None, drop: bool = False, inplace: bool = ... + self, + level: blocks.LevelsType = None, + drop: bool = False, + inplace: bool = ..., + col_level: Union[int, str] = ..., + col_fill: Hashable = ..., + allow_duplicates: Optional[bool] = ..., + names: Union[None, Hashable, Sequence[Hashable]] = ..., ) -> Optional[DataFrame]: ... def reset_index( - self, level: blocks.LevelsType = None, drop: bool = False, inplace: bool = False + self, + level: blocks.LevelsType = None, + drop: bool = False, + inplace: bool = False, + col_level: Union[int, str] = 0, + col_fill: Hashable = "", + allow_duplicates: Optional[bool] = None, + names: Union[None, Hashable, Sequence[Hashable]] = None, ) -> Optional[DataFrame]: - block = self._block.reset_index(level, drop) + block = self._block + if names: + if isinstance(names, blocks.Label) and not isinstance(names, tuple): + names = [names] + else: + names = list(names) + + if len(names) != self.index.nlevels: + raise ValueError("'names' must be same length as levels") + + block = block.with_index_labels(names) + if allow_duplicates is None: + allow_duplicates = False + block = block.reset_index( + level, + drop, + col_level=col_level, + col_fill=col_fill, + allow_duplicates=allow_duplicates, + ) if inplace: self._set_block(block) return None diff --git a/bigframes/series.py b/bigframes/series.py index 6f48935ec9..58bd47bff0 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -414,6 +414,7 @@ def reset_index( name: typing.Optional[str] = ..., drop: Literal[False] = ..., inplace: Literal[False] = ..., + allow_duplicates: Optional[bool] = ..., ) -> bigframes.dataframe.DataFrame: 
... @@ -425,6 +426,7 @@ def reset_index( name: typing.Optional[str] = ..., drop: Literal[True] = ..., inplace: Literal[False] = ..., + allow_duplicates: Optional[bool] = ..., ) -> Series: ... @@ -436,6 +438,7 @@ def reset_index( name: typing.Optional[str] = ..., drop: bool = ..., inplace: Literal[True] = ..., + allow_duplicates: Optional[bool] = ..., ) -> None: ... @@ -447,8 +450,11 @@ def reset_index( name: typing.Optional[str] = None, drop: bool = False, inplace: bool = False, + allow_duplicates: Optional[bool] = None, ) -> bigframes.dataframe.DataFrame | Series | None: - block = self._block.reset_index(level, drop) + if allow_duplicates is None: + allow_duplicates = False + block = self._block.reset_index(level, drop, allow_duplicates=allow_duplicates) if drop: if inplace: self._set_block(block) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index f752346bef..8a570ade45 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2085,6 +2085,32 @@ def test_reset_index(scalars_df_index, scalars_pandas_df_index, drop): pandas.testing.assert_frame_equal(bf_result, pd_result) +def test_reset_index_allow_duplicates(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.copy() + scalars_df_index.index.name = "int64_col" + df = scalars_df_index.reset_index(allow_duplicates=True, drop=False) + assert df.index.name is None + + bf_result = df.to_pandas() + + scalars_pandas_df_index = scalars_pandas_df_index.copy() + scalars_pandas_df_index.index.name = "int64_col" + pd_result = scalars_pandas_df_index.reset_index(allow_duplicates=True, drop=False) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering. 
+ pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_reset_index_duplicates_error(scalars_df_index): + scalars_df_index = scalars_df_index.copy() + scalars_df_index.index.name = "int64_col" + with pytest.raises(ValueError): + scalars_df_index.reset_index(allow_duplicates=False, drop=False) + + @pytest.mark.parametrize( ("drop",), ((True,), (False,)), diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 0c23ea97ae..f15b8d8b21 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -929,16 +929,30 @@ def test_column_multi_index_rename(scalars_df_index, scalars_pandas_df_index): pandas.testing.assert_frame_equal(bf_result, pd_result) -def test_column_multi_index_reset_index(scalars_df_index, scalars_pandas_df_index): +@pytest.mark.parametrize( + ("names", "col_fill", "col_level"), + [ + (None, "", "l2"), + (("new_name"), "fill", 1), + ("new_name", "fill", 0), + ], +) +def test_column_multi_index_reset_index( + scalars_df_index, scalars_pandas_df_index, names, col_fill, col_level +): columns = ["int64_too", "int64_col", "float64_col"] - multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], ["a", "b", "b"])) + multi_columns = pandas.MultiIndex.from_tuples( + zip(["a", "b", "a"], ["a", "b", "b"]), names=["l1", "l2"] + ) bf_df = scalars_df_index[columns].copy() bf_df.columns = multi_columns pd_df = scalars_pandas_df_index[columns].copy() pd_df.columns = multi_columns - bf_result = bf_df.reset_index().to_pandas() - pd_result = pd_df.reset_index() + bf_result = bf_df.reset_index( + names=names, col_fill=col_fill, col_level=col_level + ).to_pandas() + pd_result = pd_df.reset_index(names=names, col_fill=col_fill, col_level=col_level) # Pandas uses int64 instead of Int64 (nullable) dtype. 
pd_result.index = pd_result.index.astype(pandas.Int64Dtype()) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 2172962046..60a3d73dd4 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1339,6 +1339,32 @@ def test_reset_index_drop(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) +def test_series_reset_index_allow_duplicates(scalars_df_index, scalars_pandas_df_index): + bf_series = scalars_df_index["int64_col"].copy() + bf_series.index.name = "int64_col" + df = bf_series.reset_index(allow_duplicates=True, drop=False) + assert df.index.name is None + + bf_result = df.to_pandas() + + pd_series = scalars_pandas_df_index["int64_col"].copy() + pd_series.index.name = "int64_col" + pd_result = pd_series.reset_index(allow_duplicates=True, drop=False) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering. 
+ pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_series_reset_index_duplicates_error(scalars_df_index): + scalars_df_index = scalars_df_index["int64_col"].copy() + scalars_df_index.index.name = "int64_col" + with pytest.raises(ValueError): + scalars_df_index.reset_index(allow_duplicates=False, drop=False) + + def test_series_reset_index_inplace(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.sort_index(ascending=False)["float64_col"] bf_result.reset_index(drop=True, inplace=True) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 00984935a4..44ca558070 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -1605,6 +1605,10 @@ def reset_index( *, drop: bool = False, inplace: bool = False, + col_level: Hashable = 0, + col_fill: Hashable = "", + allow_duplicates: Optional[bool] = None, + names: Hashable | Sequence[Hashable] | None = None, ) -> DataFrame | None: """Reset the index. @@ -1706,6 +1710,19 @@ class name speed max the index to the default integer index. inplace (bool, default False): Whether to modify the DataFrame rather than creating a new one. + col_level (int or str, default 0): + If the columns have multiple levels, determines which level the + labels are inserted into. By default it is inserted into the first + level. + col_fill (object, default ''): + If the columns have multiple levels, determines how the other + levels are named. If None then the index name is repeated. + allow_duplicates (bool, optional, default None): + Allow duplicate column labels to be created. + names (str or 1-dimensional list, default None): + Using the given string, rename the DataFrame column which contains the + index data. 
If the DataFrame has a MultiIndex, this has to be a list or + tuple with length equal to the number of levels. Returns: bigframes.pandas.DataFrame: DataFrame with the new index. diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 7b420cf6e3..932959a826 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -326,6 +326,7 @@ def reset_index( drop: bool = False, name=pd_ext.no_default, inplace: bool = False, + allow_duplicates: Optional[bool] = None, ) -> DataFrame | Series | None: """ Generate a new DataFrame or Series with the index reset. @@ -413,6 +414,8 @@ def reset_index( when `drop` is True. inplace (bool, default False): Modify the Series in place (do not create a new object). + allow_duplicates (bool, optional, default None): + Allow duplicate column labels to be created. Returns: bigframes.pandas.Series or bigframes.pandas.DataFrame or None: