Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,12 +387,21 @@ def reversed(self) -> Block:
index_labels=self.index.names,
)

def reset_index(self, level: LevelsType = None, drop: bool = True) -> Block:
def reset_index(
self,
level: LevelsType = None,
drop: bool = True,
*,
col_level: Union[str, int] = 0,
col_fill: typing.Hashable = "",
allow_duplicates: bool = False,
) -> Block:
"""Reset the index of the block, promoting the old index to a value column.

Arguments:
level: the label or index level of the index levels to remove.
name: this is the column id for the new value id derived from the old index
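col_level: for MultiIndex column labels, the level (integer position or level name) that receives the promoted index label.
col_fill: for MultiIndex column labels, the value placed in every other level of the inserted label.
allow_duplicates: if False, raise ValueError when a promoted index label already exists among the column labels.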

Returns:
A new Block because dropping index columns can break references
Expand Down Expand Up @@ -438,6 +447,11 @@ def reset_index(self, level: LevelsType = None, drop: bool = True) -> Block:
)
else:
# Add index names to column index
col_level_n = (
col_level
if isinstance(col_level, int)
else self.column_labels.names.index(col_level)
)
column_labels_modified = self.column_labels
for position, level_id in enumerate(level_ids):
label = self.col_id_to_index_name[level_id]
Expand All @@ -447,11 +461,15 @@ def reset_index(self, level: LevelsType = None, drop: bool = True) -> Block:
else:
label = f"level_{self.index_columns.index(level_id)}"

if label in self.column_labels:
if (not allow_duplicates) and (label in self.column_labels):
raise ValueError(f"cannot insert {label}, already exists")

if isinstance(self.column_labels, pd.MultiIndex):
nlevels = self.column_labels.nlevels
label = tuple(label if i == 0 else "" for i in range(nlevels))
label = tuple(
label if i == col_level_n else col_fill for i in range(nlevels)
)

# Create index copy with label inserted
# See: https://pandas.pydata.org/docs/reference/api/pandas.Index.insert.html
column_labels_modified = column_labels_modified.insert(position, label)
Expand Down
47 changes: 44 additions & 3 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2321,6 +2321,10 @@ def reset_index(
level: blocks.LevelsType = ...,
drop: bool = ...,
inplace: Literal[False] = ...,
col_level: Union[int, str] = ...,
col_fill: Hashable = ...,
allow_duplicates: Optional[bool] = ...,
names: Union[None, Hashable, Sequence[Hashable]] = ...,
) -> DataFrame:
...

Expand All @@ -2330,19 +2334,56 @@ def reset_index(
level: blocks.LevelsType = ...,
drop: bool = ...,
inplace: Literal[True] = ...,
col_level: Union[int, str] = ...,
col_fill: Hashable = ...,
allow_duplicates: Optional[bool] = ...,
names: Union[None, Hashable, Sequence[Hashable]] = ...,
) -> None:
...

@overload
def reset_index(
self, level: blocks.LevelsType = None, drop: bool = False, inplace: bool = ...
self,
level: blocks.LevelsType = None,
drop: bool = False,
inplace: bool = ...,
col_level: Union[int, str] = ...,
col_fill: Hashable = ...,
allow_duplicates: Optional[bool] = ...,
names: Union[None, Hashable, Sequence[Hashable]] = ...,
) -> Optional[DataFrame]:
...

def reset_index(
self, level: blocks.LevelsType = None, drop: bool = False, inplace: bool = False
self,
level: blocks.LevelsType = None,
drop: bool = False,
inplace: bool = False,
col_level: Union[int, str] = 0,
col_fill: Hashable = "",
allow_duplicates: Optional[bool] = None,
names: Union[None, Hashable, Sequence[Hashable]] = None,
) -> Optional[DataFrame]:
block = self._block.reset_index(level, drop)
block = self._block
if names:
if isinstance(names, blocks.Label) and not isinstance(names, tuple):
names = [names]
else:
names = list(names)

if len(names) != self.index.nlevels:
raise ValueError("'names' must be same length as levels")

block = block.with_index_labels(names)
if allow_duplicates is None:
allow_duplicates = False
block = block.reset_index(
level,
drop,
col_level=col_level,
col_fill=col_fill,
allow_duplicates=allow_duplicates,
)
if inplace:
self._set_block(block)
return None
Expand Down
8 changes: 7 additions & 1 deletion bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,7 @@ def reset_index(
name: typing.Optional[str] = ...,
drop: Literal[False] = ...,
inplace: Literal[False] = ...,
allow_duplicates: Optional[bool] = ...,
) -> bigframes.dataframe.DataFrame:
...

Expand All @@ -425,6 +426,7 @@ def reset_index(
name: typing.Optional[str] = ...,
drop: Literal[True] = ...,
inplace: Literal[False] = ...,
allow_duplicates: Optional[bool] = ...,
) -> Series:
...

Expand All @@ -436,6 +438,7 @@ def reset_index(
name: typing.Optional[str] = ...,
drop: bool = ...,
inplace: Literal[True] = ...,
allow_duplicates: Optional[bool] = ...,
) -> None:
...

Expand All @@ -447,8 +450,11 @@ def reset_index(
name: typing.Optional[str] = None,
drop: bool = False,
inplace: bool = False,
allow_duplicates: Optional[bool] = None,
) -> bigframes.dataframe.DataFrame | Series | None:
block = self._block.reset_index(level, drop)
if allow_duplicates is None:
allow_duplicates = False
block = self._block.reset_index(level, drop, allow_duplicates=allow_duplicates)
if drop:
if inplace:
self._set_block(block)
Expand Down
26 changes: 26 additions & 0 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2085,6 +2085,32 @@ def test_reset_index(scalars_df_index, scalars_pandas_df_index, drop):
pandas.testing.assert_frame_equal(bf_result, pd_result)


def test_reset_index_allow_duplicates(scalars_df_index, scalars_pandas_df_index):
    """Check that reset_index(allow_duplicates=True) matches pandas.

    The index is renamed to the label of an existing column so that promoting
    it to a value column produces a duplicate column label.
    """
    duplicate_label = "int64_col"

    # Work on copies so the fixture objects keep their original index names.
    bf_df = scalars_df_index.copy()
    bf_df.index.name = duplicate_label
    bf_reset = bf_df.reset_index(allow_duplicates=True, drop=False)
    assert bf_reset.index.name is None

    bf_result = bf_reset.to_pandas()

    pd_df = scalars_pandas_df_index.copy()
    pd_df.index.name = duplicate_label
    pd_result = pd_df.reset_index(allow_duplicates=True, drop=False)

    # pandas produces an int64 default index; cast it to nullable Int64 so the
    # index dtypes compare equal.
    nullable_int = pd.Int64Dtype()
    pd_result.index = pd_result.index.astype(nullable_int)

    # Frame equality also verifies that the original row ordering is kept.
    pandas.testing.assert_frame_equal(bf_result, pd_result)


def test_reset_index_duplicates_error(scalars_df_index):
    """Check that reset_index(allow_duplicates=False) rejects a duplicate label.

    The index is named after an existing column, so promoting it to a value
    column must raise ValueError.
    """
    # Copy first so renaming the index does not touch the fixture object.
    bf_df = scalars_df_index.copy()
    bf_df.index.name = "int64_col"

    with pytest.raises(ValueError):
        bf_df.reset_index(allow_duplicates=False, drop=False)


@pytest.mark.parametrize(
("drop",),
((True,), (False,)),
Expand Down
22 changes: 18 additions & 4 deletions tests/system/small/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -929,16 +929,30 @@ def test_column_multi_index_rename(scalars_df_index, scalars_pandas_df_index):
pandas.testing.assert_frame_equal(bf_result, pd_result)


def test_column_multi_index_reset_index(scalars_df_index, scalars_pandas_df_index):
@pytest.mark.parametrize(
("names", "col_fill", "col_level"),
[
(None, "", "l2"),
(("new_name"), "fill", 1),
("new_name", "fill", 0),
],
)
def test_column_multi_index_reset_index(
scalars_df_index, scalars_pandas_df_index, names, col_fill, col_level
):
columns = ["int64_too", "int64_col", "float64_col"]
multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], ["a", "b", "b"]))
multi_columns = pandas.MultiIndex.from_tuples(
zip(["a", "b", "a"], ["a", "b", "b"]), names=["l1", "l2"]
)
bf_df = scalars_df_index[columns].copy()
bf_df.columns = multi_columns
pd_df = scalars_pandas_df_index[columns].copy()
pd_df.columns = multi_columns

bf_result = bf_df.reset_index().to_pandas()
pd_result = pd_df.reset_index()
bf_result = bf_df.reset_index(
names=names, col_fill=col_fill, col_level=col_level
).to_pandas()
pd_result = pd_df.reset_index(names=names, col_fill=col_fill, col_level=col_level)

# Pandas uses int64 instead of Int64 (nullable) dtype.
pd_result.index = pd_result.index.astype(pandas.Int64Dtype())
Expand Down
26 changes: 26 additions & 0 deletions tests/system/small/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1339,6 +1339,32 @@ def test_reset_index_drop(scalars_df_index, scalars_pandas_df_index):
pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result)


def test_series_reset_index_allow_duplicates(scalars_df_index, scalars_pandas_df_index):
    """Check that Series.reset_index(allow_duplicates=True) matches pandas.

    The series' index is given the same name as the series' own column, so the
    resulting DataFrame carries a duplicate column label.
    """
    column = "int64_col"

    bf_series = scalars_df_index[column].copy()
    bf_series.index.name = column
    bf_reset = bf_series.reset_index(allow_duplicates=True, drop=False)
    assert bf_reset.index.name is None

    bf_result = bf_reset.to_pandas()

    pd_series = scalars_pandas_df_index[column].copy()
    pd_series.index.name = column
    pd_result = pd_series.reset_index(allow_duplicates=True, drop=False)

    # pandas produces an int64 default index; cast it to nullable Int64 so the
    # index dtypes compare equal.
    nullable_int = pd.Int64Dtype()
    pd_result.index = pd_result.index.astype(nullable_int)

    # Frame equality also verifies that the original row ordering is kept.
    pd.testing.assert_frame_equal(bf_result, pd_result)


def test_series_reset_index_duplicates_error(scalars_df_index):
    """Check that Series.reset_index(allow_duplicates=False) rejects a duplicate label.

    The index is named after the series' own column, so promoting it to a
    value column must raise ValueError.
    """
    # Select and copy the column so renaming its index leaves the fixture alone.
    bf_series = scalars_df_index["int64_col"].copy()
    bf_series.index.name = "int64_col"

    with pytest.raises(ValueError):
        bf_series.reset_index(allow_duplicates=False, drop=False)


def test_series_reset_index_inplace(scalars_df_index, scalars_pandas_df_index):
bf_result = scalars_df_index.sort_index(ascending=False)["float64_col"]
bf_result.reset_index(drop=True, inplace=True)
Expand Down
17 changes: 17 additions & 0 deletions third_party/bigframes_vendored/pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1605,6 +1605,10 @@ def reset_index(
*,
drop: bool = False,
inplace: bool = False,
col_level: Hashable = 0,
col_fill: Hashable = "",
allow_duplicates: Optional[bool] = None,
names: Hashable | Sequence[Hashable] | None = None,
) -> DataFrame | None:
"""Reset the index.

Expand Down Expand Up @@ -1706,6 +1710,19 @@ class name speed max
the index to the default integer index.
inplace (bool, default False):
Whether to modify the DataFrame rather than creating a new one.
col_level (int or str, default 0):
If the columns have multiple levels, determines which level the
labels are inserted into. By default it is inserted into the first
level.
col_fill (object, default ''):
If the columns have multiple levels, determines how the other
levels are named. If None then the index name is repeated.
allow_duplicates (bool, optional, default None):
Allow duplicate column labels to be created.
names (str or 1-dimensional list, default None):
Using the given string, rename the DataFrame column which contains the
index data. If the DataFrame has a MultiIndex, this has to be a list or
tuple with length equal to the number of levels

Returns:
bigframes.pandas.DataFrame: DataFrame with the new index.
Expand Down
3 changes: 3 additions & 0 deletions third_party/bigframes_vendored/pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,7 @@ def reset_index(
drop: bool = False,
name=pd_ext.no_default,
inplace: bool = False,
allow_duplicates: Optional[bool] = None,
) -> DataFrame | Series | None:
"""
Generate a new DataFrame or Series with the index reset.
Expand Down Expand Up @@ -413,6 +414,8 @@ def reset_index(
when `drop` is True.
inplace (bool, default False):
Modify the Series in place (do not create a new object).
allow_duplicates (bool, optional, default None):
Allow duplicate column labels to be created.

Returns:
bigframes.pandas.Series or bigframes.pandas.DataFrame or None:
Expand Down