diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py
index 9b4b5e4290..6da68e2e8f 100644
--- a/bigframes/core/indexes/base.py
+++ b/bigframes/core/indexes/base.py
@@ -451,6 +451,12 @@ def drop_duplicates(self, *, keep: str = "first") -> Index:
         block = block_ops.drop_duplicates(self._block, self._block.index_columns, keep)
         return Index(block)
 
+    def unique(self, level: Hashable | int | None = None) -> Index:
+        if level is None:
+            return self.drop_duplicates()
+
+        return self.get_level_values(level).drop_duplicates()
+
     def isin(self, values) -> Index:
         if not utils.is_list_like(values):
             raise TypeError(
diff --git a/tests/system/small/core/indexes/test_base.py b/tests/system/small/core/indexes/test_base.py
new file mode 100644
index 0000000000..05ea40cfb9
--- /dev/null
+++ b/tests/system/small/core/indexes/test_base.py
@@ -0,0 +1,35 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from packaging import version
+import pandas as pd
+import pandas.testing
+import pytest
+
+
+@pytest.mark.parametrize("level", [None, 0, 1, "level0", "level1"])
+def test_unique(session, level):
+    if version.Version(pd.__version__) < version.Version("2.0.0"):
+        pytest.skip("StringDtype for multi-index not supported until Pandas 2.0")
+    arrays = [
+        pd.Series(["A", "A", "B", "B", "A"], dtype=pd.StringDtype(storage="pyarrow")),
+        pd.Series([1, 2, 1, 2, 1], dtype=pd.Int64Dtype()),
+    ]
+    pd_idx = pd.MultiIndex.from_arrays(arrays, names=["level0", "level1"])
+    bf_idx = session.read_pandas(pd_idx)
+
+    actual_result = bf_idx.unique(level).to_pandas()
+
+    expected_result = pd_idx.unique(level)
+    pandas.testing.assert_index_equal(actual_result, expected_result)
diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py
index c94f707671..be1c5034f9 100644
--- a/third_party/bigframes_vendored/pandas/core/indexes/base.py
+++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py
@@ -1,6 +1,7 @@
 # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/indexes/base.py
 from __future__ import annotations
 
+from collections.abc import Hashable
 import typing
 
 from bigframes import constants
@@ -1061,6 +1062,28 @@ def drop_duplicates(self, *, keep: str = "first"):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def unique(self, level: Hashable | int | None = None):
+        """
+        Returns unique values in the index.
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+            >>> idx = bpd.Index([1, 1, 2, 3, 3])
+            >>> idx.unique()
+            Index([1, 2, 3], dtype='Int64')
+
+        Args:
+            level (int or hashable, optional):
+                Only return values from specified level (for MultiIndex).
+                If int, gets the level by integer position, else by level name.
+
+        Returns:
+            bigframes.pandas.Index
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def to_numpy(self, dtype, *, allow_large_results=None):
         """
         A NumPy ndarray representing the values in this Series or Index.