-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
ENH: add DataFrame.is_unique method #37565
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a835b37
f9d9e42
5472bbb
846b26d
2e02f10
47d2593
63643a0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5372,6 +5372,59 @@ def drop_duplicates( | |
else: | ||
return result | ||
|
||
def is_unique(
    self, subset: Optional[Union[Hashable, Sequence[Hashable]]] = None
) -> Series:
    """
    Return boolean Series denoting which columns have unique values.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    subset : column label or sequence of labels, optional
        Only check this subset of columns for uniqueness. By default all
        columns are checked.

    Returns
    -------
    Series
        Boolean Series indexed by the checked column labels; an entry is
        ``True`` when that column contains no repeated values.

    See Also
    --------
    DataFrame.duplicated : Indicate duplicate rows.

    Examples
    --------
    >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
    ...                    ('parrot', 'bird', 24.0),
    ...                    ('lion', 'mammal', 80.5),
    ...                    ('monkey', 'mammal', np.nan)],
    ...                   columns=('name', 'class', 'max_speed'))
    >>> df
         name   class  max_speed
    0  falcon    bird      389.0
    1  parrot    bird       24.0
    2    lion  mammal       80.5
    3  monkey  mammal        NaN
    >>> df.is_unique()
    name          True
    class        False
    max_speed     True
    dtype: bool
    >>> df.is_unique(["name", "class"])
    name      True
    class    False
    dtype: bool
    """
    # Select into a local name instead of rebinding ``self``.
    frame = self if subset is None else self[com.maybe_make_list(subset)]

    # Review note: a DataFrame with no columns never calls the function
    # handed to ``DataFrame.apply``, which is why an apply-based version
    # needs a dedicated empty branch. Checking each column by position
    # avoids that special case and also copes with duplicated labels.
    flags = [frame.iloc[:, pos].is_unique for pos in range(len(frame.columns))]
    return self._constructor_sliced(flags, index=frame.columns, dtype=bool)
|
||
def duplicated( | ||
self, | ||
subset: Optional[Union[Hashable, Sequence[Hashable]]] = None, | ||
|
@@ -5405,6 +5458,7 @@ def duplicated( | |
Series.duplicated : Equivalent method on Series. | ||
Series.drop_duplicates : Remove duplicate values from Series. | ||
DataFrame.drop_duplicates : Remove duplicate values from DataFrame. | ||
DataFrame.is_unique : Indicate columns with unique values. | ||
|
||
Examples | ||
-------- | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import pytest | ||
|
||
from pandas import DataFrame, Series | ||
import pandas._testing as tm | ||
|
||
|
||
@pytest.mark.parametrize(
    "frame, expected",
    [
        # no columns
        [DataFrame(), Series(dtype=bool)],
        # single column
        [DataFrame({"a": ["x"]}), Series({"a": True})],
        [DataFrame({"a": ["x", "y"]}), Series({"a": True})],
        [DataFrame({"a": ["x", "x"]}), Series({"a": False})],
        [DataFrame({"a": ["x", "y", "y"]}), Series({"a": False})],
        # single column, repeated missing values count as duplicates
        [
            DataFrame({"a": [1.0, float("nan"), float("nan")]}),
            Series({"a": False}),
        ],
        # multiple columns
        [DataFrame(columns=["a", "b"]), Series({"a": True, "b": True})],
        [DataFrame({"a": ["x"], "b": ["y"]}), Series({"a": True, "b": True})],
        [
            DataFrame({"a": ["x", "y"], "b": ["x", "x"]}),
            Series({"a": True, "b": False}),
        ],
        # multiple columns, same column name
        [DataFrame(columns=["a", "a"]), Series([True, True], index=["a", "a"])],
        [
            DataFrame([["x", "y"]], columns=["a", "a"]),
            Series([True, True], index=["a", "a"]),
        ],
        [
            DataFrame([["x", "y"], ["y", "y"]], columns=["a", "a"]),
            Series([True, False], index=["a", "a"]),
        ],
    ],
)
def test_is_unique(frame, expected):
    """Check ``DataFrame.is_unique`` over all columns of ``frame``."""
    # GH37565
    result = frame.is_unique()
    tm.assert_series_equal(result, expected)
|
||
|
||
@pytest.mark.parametrize(
    "frame, subset, expected",
    [
        (DataFrame(columns=["a", "b"]), ["a"], Series({"a": True})),
        (DataFrame({"a": ["x"], "b": ["y"]}), "a", Series({"a": True})),
        (DataFrame({"a": ["x"], "b": ["y"]}), ["a"], Series({"a": True})),
        (
            DataFrame({"a": ["x", "y"], "b": ["x", "x"]}),
            ["a", "b"],
            Series({"a": True, "b": False}),
        ),
    ],
)
def test_is_unique_subsetting(frame, subset, expected):
    """Check that ``subset`` limits ``is_unique`` to the requested columns."""
    # GH37565
    tm.assert_series_equal(frame.is_unique(subset=subset), expected)
Uh oh!
There was an error while loading. Please reload this page.