Skip to content

ENH: Categorical.fillna allow Categorical/ndarray #32420

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Mar 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 11 additions & 7 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from pandas._libs import Timestamp, algos, hashtable as htable, lib
from pandas._libs.tslib import iNaT
from pandas._typing import AnyArrayLike
from pandas.util._decorators import doc

from pandas.core.dtypes.cast import (
Expand Down Expand Up @@ -45,10 +46,14 @@
is_unsigned_integer_dtype,
needs_i8_conversion,
)
from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries
from pandas.core.dtypes.generic import (
ABCExtensionArray,
ABCIndex,
ABCIndexClass,
ABCSeries,
)
from pandas.core.dtypes.missing import isna, na_value_for_dtype

import pandas.core.common as com
from pandas.core.construction import array, extract_array
from pandas.core.indexers import validate_indices

Expand Down Expand Up @@ -384,7 +389,7 @@ def unique(values):
unique1d = unique


def isin(comps, values) -> np.ndarray:
def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
"""
Compute the isin boolean array.

Expand All @@ -409,15 +414,14 @@ def isin(comps, values) -> np.ndarray:
f"to isin(), you passed a [{type(values).__name__}]"
)

if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)):
if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
values = construct_1d_object_array_from_listlike(list(values))

comps = extract_array(comps, extract_numpy=True)
if is_categorical_dtype(comps):
# TODO(extension)
# handle categoricals
return comps._values.isin(values)

comps = com.values_from_object(comps)
return comps.isin(values) # type: ignore
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this type ignore?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bc mypy doesnt know enough to rule out ndarray (or even non-Categorical EA) here

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure i understand. shouldn't you just add

result = comps.isin(values)
assert isinstance(result, np.ndarray)
return result

prefer this to a type ignore

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the issue isnt type(result), its type(comps). Categorical has a isin method, but general ExtensionArray does not

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, is there a way to avoid the type ignore, i hate these things because we never remove them

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what about an isinstance(ABCCategorical) ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

or ok with a run-time import of Categorical if the above doesn't work;

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mypy doesnt recognize any of our ABCFoos

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so you have an aversion to "# type: ignore" and i have an aversion to a runtime import just for mypy. is there a middle ground somewhere? maybe revisit in a few months to see if the type ignore can be removed?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, let's do that, also let's create an issue to see if we can fix the ABC's and mypy


comps, dtype = _ensure_data(comps)
values, _ = _ensure_data(values, dtype=dtype)
Expand Down
9 changes: 7 additions & 2 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1738,12 +1738,17 @@ def fillna(self, value=None, method=None, limit=None):

# If value is a dict or a Series (a dict value has already
# been converted to a Series)
if isinstance(value, ABCSeries):
if not value[~value.isin(self.categories)].isna().all():
if isinstance(value, (np.ndarray, Categorical, ABCSeries)):
# We get ndarray or Categorical if called via Series.fillna,
# where it will unwrap another aligned Series before getting here

mask = ~algorithms.isin(value, self.categories)
if not isna(value[mask]).all():
raise ValueError("fill value must be in categories")

values_codes = _get_codes_for_values(value, self.categories)
indexer = np.where(codes == -1)
codes = codes.copy()
codes[indexer] = values_codes[indexer]

# If value is not a dict or Series it should be a scalar
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -5988,6 +5988,8 @@ def fillna(
value = create_series_with_explicit_dtype(
value, dtype_if_empty=object
)
value = value.reindex(self.index, copy=False)
value = value._values
elif not is_list_like(value):
pass
else:
Expand Down
6 changes: 1 addition & 5 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,7 @@ def apply(self, f, filter=None, **kwargs) -> "BlockManager":
BlockManager
"""
result_blocks = []
# fillna: Series/DataFrame is responsible for making sure value is aligned

# filter kwarg is used in replace-* family of methods
if filter is not None:
Expand All @@ -420,11 +421,6 @@ def apply(self, f, filter=None, **kwargs) -> "BlockManager":
align_keys = ["new", "mask"]
else:
align_keys = ["mask"]
elif f == "fillna":
# fillna internally does putmask, maybe it's better to do this
# at mgr, not block level?
align_copy = False
align_keys = ["value"]
else:
align_keys = []

Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/arrays/categorical/test_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,18 @@ def test_fillna_iterable_category(self, named):
expected = Categorical([Point(0, 0), Point(0, 1), Point(0, 0)])

tm.assert_categorical_equal(result, expected)

def test_fillna_array(self):
# accept Categorical or ndarray value if it holds appropriate values
cat = Categorical(["A", "B", "C", None, None])

other = cat.fillna("C")
result = cat.fillna(other)
tm.assert_categorical_equal(result, other)
assert isna(cat[-1]) # didnt modify original inplace

other = np.array(["A", "B", "C", "B", "A"])
result = cat.fillna(other)
expected = Categorical(["A", "B", "C", "B", "A"], dtype=cat.dtype)
tm.assert_categorical_equal(result, expected)
assert isna(cat[-1]) # didnt modify original inplace
10 changes: 10 additions & 0 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -760,6 +760,16 @@ def test_categorical_from_codes(self):
result = algos.isin(Sd, St)
tm.assert_numpy_array_equal(expected, result)

def test_categorical_isin(self):
vals = np.array([0, 1, 2, 0])
cats = ["a", "b", "c"]
cat = Categorical(1).from_codes(vals, cats)
other = Categorical(1).from_codes(np.array([0, 1]), cats)

expected = np.array([True, True, False, True])
result = algos.isin(cat, other)
tm.assert_numpy_array_equal(expected, result)

def test_same_nan_is_in(self):
# GH 22160
# nan is special, because from " a is b" doesn't follow "a == b"
Expand Down