ENH: Move factorizing for merge to EA interface #54975

Closed · wants to merge 19 commits
1 change: 1 addition & 0 deletions doc/source/reference/extensions.rst
@@ -35,6 +35,7 @@ objects.
api.extensions.ExtensionArray._accumulate
api.extensions.ExtensionArray._concat_same_type
api.extensions.ExtensionArray._explode
api.extensions.ExtensionArray._factorize_with_other_for_merge
api.extensions.ExtensionArray._formatter
api.extensions.ExtensionArray._from_factorized
api.extensions.ExtensionArray._from_sequence
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
@@ -100,6 +100,7 @@ Other enhancements
- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. (:issue:`54264`)
- :func:`tseries.api.guess_datetime_format` is now part of the public API (:issue:`54727`)
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
- :meth:`ExtensionArray._factorize_with_other_for_merge` interface method added to allow efficient extension type factorize implementations for merge (:issue:`54975`)
- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
- Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`)
- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
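To make the new hook concrete, here is a rough usage sketch against this branch (not part of the diff; `_factorize_with_other_for_merge` is a private method, and the exact treatment of missing values is decided by `factorize_arrays`):

```python
import pandas as pd

left = pd.array([1, 2, 2, None], dtype="Int64")
right = pd.array([2, 3], dtype="Int64")

# Private hook added by this PR; only present on branches that include it.
llab, rlab, count = left._factorize_with_other_for_merge(right, sort=False)

# llab and rlab are np.intp codes into a single shared set of uniques, so
# equal values on both sides receive equal codes; count is the number of
# uniques (missing values, when present, get their own reserved code).
```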
44 changes: 44 additions & 0 deletions pandas/core/arrays/arrow/array.py
@@ -39,6 +39,7 @@
is_integer,
is_list_like,
is_scalar,
needs_i8_conversion,
)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.missing import isna
@@ -62,6 +63,7 @@
unpack_tuple_and_ellipses,
validate_indices,
)
from pandas.core.reshape.merge_utils import factorize_arrays
from pandas.core.strings.base import BaseStringArrayMethods

from pandas.io._util import _arrow_dtype_mapping
@@ -2625,6 +2627,48 @@ def _dt_tz_convert(self, tz):
result = self._pa_array.cast(pa.timestamp(current_unit, tz))
return type(self)(result)

def _factorize_with_other_for_merge(
self, other: Self, sort: bool = False
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
if not isinstance(self.dtype, StringDtype) and (
pa.types.is_floating(self.dtype.pyarrow_dtype)
or pa.types.is_integer(self.dtype.pyarrow_dtype)
or pa.types.is_unsigned_integer(self.dtype.pyarrow_dtype)
or pa.types.is_duration(self.dtype.pyarrow_dtype)
or pa.types.is_date(self.dtype.pyarrow_dtype)
or pa.types.is_timestamp(self.dtype.pyarrow_dtype)
):
# Keep using current logic
# TODO: check if new implementation with dictionary_encode is faster
lk = self.to_numpy(na_value=1, dtype=self.dtype.numpy_dtype)
rk = other.to_numpy(na_value=1, dtype=lk.dtype)

if needs_i8_conversion(lk.dtype):
lk = np.asarray(lk, dtype=np.int64)
rk = np.asarray(rk, dtype=np.int64)

return factorize_arrays(lk, rk, sort, self.isna(), other.isna())

len_left = len(self)
left = self._pa_array
right = other._pa_array
dc = (
pa.chunked_array(left.chunks + right.chunks)
.combine_chunks()
.dictionary_encode()
)
length = len(dc.dictionary)

llab = pc.fill_null(dc.indices[slice(len_left)], length)
llab = llab.to_numpy().astype(np.intp, copy=False)
rlab = pc.fill_null(dc.indices[slice(len_left, None)], length)
rlab = rlab.to_numpy().astype(np.intp, copy=False)
count = len(dc.dictionary)

if dc.null_count > 0:
count += 1
return llab, rlab, count


def transpose_homogeneous_pyarrow(
arrays: Sequence[ArrowExtensionArray],
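The pyarrow branch above works by encoding both sides against one shared dictionary and then splitting the resulting codes back into a left and a right half. A simplified standalone sketch of that trick (toy inputs, and `pa.concat_arrays` in place of rebuilding a chunked array):

```python
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

left = pa.array(["a", "b", None])
right = pa.array(["b", "c"])

# One dictionary over both sides, then split the codes back apart.
dc = pa.concat_arrays([left, right]).dictionary_encode()
length = len(dc.dictionary)  # number of distinct non-null values
llab = pc.fill_null(dc.indices[: len(left)], length).to_numpy().astype(np.intp)
rlab = pc.fill_null(dc.indices[len(left) :], length).to_numpy().astype(np.intp)
count = length + 1 if dc.null_count > 0 else length
# llab -> [0, 1, 3], rlab -> [1, 2], count -> 4 (nulls map to sentinel code 3)
```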
28 changes: 28 additions & 0 deletions pandas/core/arrays/base.py
@@ -70,6 +70,7 @@
unique,
)
from pandas.core.array_algos.quantile import quantile_with_mask
from pandas.core.reshape.merge_utils import factorize_arrays
from pandas.core.sorting import (
nargminmax,
nargsort,
@@ -2383,6 +2384,33 @@ def _groupby_op(
else:
raise NotImplementedError

def _factorize_with_other_for_merge(
self, other: Self, sort: bool = False
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
"""
Factorize two arrays to get codes for merge operations.

This allows extension array authors to implement efficient factorizations
for merge operations.

Parameters
----------
other : ExtensionArray
    An ExtensionArray with the same dtype as ``self``.
sort : bool, default False
    Whether to sort the shared uniques before computing the codes.

Returns
-------
tuple of (ndarray[intp], ndarray[intp], int)
    The codes for the left array, the codes for the right array, and the
    number of unique elements.

Examples
--------
Both arrays are factorized against one shared set of uniques, so equal
values in ``self`` and ``other`` map to equal codes.
"""
# self and other have equal dtypes after _values_for_factorize
lk, _ = self._values_for_factorize()
rk, _ = other._values_for_factorize()
return factorize_arrays(lk, rk, sort)


class ExtensionArraySupportsAnyAll(ExtensionArray):
def any(self, *, skipna: bool = True) -> bool:
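The base-class default above simply routes both sides through `_values_for_factorize`. The point of exposing the hook is that third-party arrays can override it with something cheaper; a hypothetical sketch (the `IPAddressArray` class and its `_ints` int64 buffer are made up for illustration, and it assumes `factorize_arrays` accepts plain integer ndarrays as it does elsewhere in this diff):

```python
import numpy as np

from pandas.api.extensions import ExtensionArray
from pandas.core.reshape.merge_utils import factorize_arrays  # added by this PR


class IPAddressArray(ExtensionArray):
    # ... the rest of the ExtensionArray interface elided ...

    def _factorize_with_other_for_merge(self, other, sort: bool = False):
        # Factorize the underlying int64 buffers directly instead of the
        # (potentially object-dtype) values from _values_for_factorize.
        lk = np.asarray(self._ints, dtype=np.int64)
        rk = np.asarray(other._ints, dtype=np.int64)
        return factorize_arrays(lk, rk, sort)
```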
7 changes: 7 additions & 0 deletions pandas/core/arrays/categorical.py
@@ -84,6 +84,7 @@
sanitize_array,
)
from pandas.core.ops.common import unpack_zerodim_and_defer
from pandas.core.reshape.merge_utils import factorize_arrays
from pandas.core.sorting import nargsort
from pandas.core.strings.object_array import ObjectStringArrayMixin

@@ -2749,6 +2750,12 @@ def _groupby_op(
res_values[result_mask == 1] = -1
return self._from_backing_data(res_values)

def _factorize_with_other_for_merge(
self, other: Self, sort: bool = False
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
rk = self._encode_with_my_categories(other)
return factorize_arrays(self.codes, rk.codes, sort)


# The Series.cat accessor

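The categorical implementation leans on `_encode_with_my_categories` to recode `other` onto `self`'s categories before comparing integer codes; the public `set_categories` has roughly the same effect (toy example, not from the diff):

```python
import pandas as pd

left = pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"])
right = pd.Categorical(["b", "c"], categories=["c", "b", "a"])

# Recode `right` onto `left`'s category order so codes mean the same thing
# on both sides; after that the codes can be factorized as plain integers.
right_aligned = right.set_categories(left.categories)
print(left.codes)           # [0 1 0]
print(right_aligned.codes)  # [1 2]
```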
9 changes: 9 additions & 0 deletions pandas/core/arrays/datetimelike.py
@@ -141,6 +141,7 @@
invalid_comparison,
make_invalid_op,
)
from pandas.core.reshape.merge_utils import factorize_arrays

from pandas.tseries import frequencies

@@ -1705,6 +1706,14 @@ def _groupby_op(
res_values = res_values.view(self._ndarray.dtype)
return self._from_backing_data(res_values)

def _factorize_with_other_for_merge(
self, other: Self, sort: bool = False
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
# caller is responsible for ensuring self.dtype == other.dtype
lk = np.asarray(self._ndarray, dtype=np.int64)
rk = np.asarray(other._ndarray, dtype=np.int64)
return factorize_arrays(lk, rk, sort)


class DatelikeOps(DatetimeLikeArrayMixin):
"""
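The datetime-like implementation casts both sides to their int64 representation before factorizing, since exact values compare identically whether viewed as timestamps or as their underlying integers; a minimal numpy illustration (not from the diff):

```python
import numpy as np

left = np.array(["2024-01-01", "2024-01-02"], dtype="datetime64[ns]")
right = np.array(["2024-01-02"], dtype="datetime64[ns]")

# The int64 view preserves the equality structure of the timestamps,
# so any integer factorizer produces the same codes.
lk = np.asarray(left.view("i8"), dtype=np.int64)
rk = np.asarray(right.view("i8"), dtype=np.int64)
```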
6 changes: 6 additions & 0 deletions pandas/core/arrays/masked.py
@@ -86,6 +86,7 @@
)
from pandas.core.indexers import check_array_indexer
from pandas.core.ops import invalid_comparison
from pandas.core.reshape.merge_utils import factorize_arrays

if TYPE_CHECKING:
from collections.abc import (
@@ -1524,6 +1525,11 @@ def _groupby_op(
# wrap in a MaskedArray
return self._maybe_mask_result(res_values, result_mask)

def _factorize_with_other_for_merge(
self, other: Self, sort: bool = False
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
return factorize_arrays(self._data, other._data, sort, self._mask, other._mask)


def transpose_homogeneous_masked_arrays(
masked_arrays: Sequence[BaseMaskedArray],