Skip to content

PERF: make Categorical._ndarray an attribute and Categorical._codes a property #40033

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 25, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 14 additions & 10 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ def __init__(
# infer categories in a factorization step further below

if fastpath:
self._codes = coerce_indexer_dtype(values, dtype.categories)
self._ndarray = coerce_indexer_dtype(values, dtype.categories)
self._dtype = self._dtype.update_dtype(dtype)
return

Expand Down Expand Up @@ -450,7 +450,7 @@ def __init__(
codes = full_codes

self._dtype = self._dtype.update_dtype(dtype)
self._codes = coerce_indexer_dtype(codes, dtype.categories)
self._ndarray = coerce_indexer_dtype(codes, dtype.categories)

@property
def dtype(self) -> CategoricalDtype:
Expand Down Expand Up @@ -923,7 +923,7 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal
codes = recode_for_categories(
cat.codes, cat.categories, new_dtype.categories
)
cat._codes = codes
cat._ndarray = codes
cat._dtype = new_dtype

if not inplace:
Expand Down Expand Up @@ -1096,7 +1096,7 @@ def add_categories(self, new_categories, inplace=False):

cat = self if inplace else self.copy()
cat._dtype = new_dtype
cat._codes = coerce_indexer_dtype(cat._codes, new_dtype.categories)
cat._ndarray = coerce_indexer_dtype(cat._ndarray, new_dtype.categories)
if not inplace:
return cat

Expand Down Expand Up @@ -1201,7 +1201,7 @@ def remove_unused_categories(self, inplace=no_default):
new_categories, ordered=self.ordered
)
cat._dtype = new_dtype
cat._codes = coerce_indexer_dtype(inv, new_dtype.categories)
cat._ndarray = coerce_indexer_dtype(inv, new_dtype.categories)

if not inplace:
return cat
Expand Down Expand Up @@ -1384,6 +1384,10 @@ def __setstate__(self, state):
if "_dtype" not in state:
state["_dtype"] = CategoricalDtype(state["_categories"], state["_ordered"])

if "_codes" in state and "_ndarray" not in state:
# backward compat: state saved before "_codes" became a property stores the codes under "_codes"; move them to the "_ndarray" attribute
state["_ndarray"] = state.pop("_codes")

for k, v in state.items():
setattr(self, k, v)

Expand Down Expand Up @@ -1785,11 +1789,11 @@ def fillna(self, value=None, method=None, limit=None):
# NDArrayBackedExtensionArray compat

@property
def _ndarray(self) -> np.ndarray:
return self._codes
def _codes(self) -> np.ndarray:
return self._ndarray

def _from_backing_data(self, arr: np.ndarray) -> Categorical:
return self._constructor(arr, dtype=self.dtype, fastpath=True)
return type(self)(arr, dtype=self.dtype, fastpath=True)

def _box_func(self, i: int):
if i == -1:
Expand All @@ -1800,7 +1804,7 @@ def _unbox_scalar(self, key) -> int:
# searchsorted is very performance sensitive. By converting the code
# to the same dtype as self.codes, we get much faster performance.
code = self.categories.get_loc(key)
code = self._codes.dtype.type(code)
code = self._ndarray.dtype.type(code)
return code

# ------------------------------------------------------------------
Expand Down Expand Up @@ -2162,7 +2166,7 @@ def unique(self):
cat = self.copy()

# keep nan in codes
cat._codes = unique_codes
cat._ndarray = unique_codes

# exclude nan from indexer for categories
take_codes = unique_codes[unique_codes != -1]
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexes/categorical/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@ def test_engine_type(self, dtype, engine_type):
# having 2**32 - 2**31 categories would be very memory-intensive,
# so we cheat a bit with the dtype
ci = CategoricalIndex(range(32768)) # == 2**16 - 2**(16 - 1)
ci.values._codes = ci.values._codes.astype("int64")
ci.values._ndarray = ci.values._ndarray.astype("int64")
assert np.issubdtype(ci.codes.dtype, dtype)
assert isinstance(ci._engine, engine_type)

Expand Down