ENH: Move factorizing for merge to EA interface #54975

Closed · wants to merge 19 commits
1 change: 1 addition & 0 deletions doc/source/reference/extensions.rst
@@ -35,6 +35,7 @@ objects.
api.extensions.ExtensionArray._accumulate
api.extensions.ExtensionArray._concat_same_type
api.extensions.ExtensionArray._explode
api.extensions.ExtensionArray._factorize_with_other_for_merge
api.extensions.ExtensionArray._formatter
api.extensions.ExtensionArray._from_factorized
api.extensions.ExtensionArray._from_sequence
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
@@ -100,6 +100,7 @@ Other enhancements
- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. (:issue:`54264`)
- :func:`tseries.api.guess_datetime_format` is now part of the public API (:issue:`54727`)
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
- :meth:`ExtensionArray._factorize_with_other_for_merge` interface method added to allow efficient extension type factorize implementations for merge (:issue:`54975`)
- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
- Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`)
- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
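To make the new hook concrete, here is a rough usage sketch against this branch (not part of the diff; `_factorize_with_other_for_merge` is a private method, and the exact treatment of missing values is decided by `factorize_arrays`):

```python
import pandas as pd

left = pd.array([1, 2, 2, None], dtype="Int64")
right = pd.array([2, 3], dtype="Int64")

# Private hook added by this PR; only present on branches that include it.
llab, rlab, count = left._factorize_with_other_for_merge(right, sort=False)

# llab and rlab are np.intp codes into a single shared set of uniques, so
# equal values on both sides receive equal codes; count is the number of
# uniques (missing values, when present, get their own reserved code).
```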
44 changes: 44 additions & 0 deletions pandas/core/arrays/arrow/array.py
@@ -39,6 +39,7 @@
is_integer,
is_list_like,
is_scalar,
needs_i8_conversion,
)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.missing import isna
@@ -62,6 +63,7 @@
unpack_tuple_and_ellipses,
validate_indices,
)
from pandas.core.reshape.merge_utils import factorize_arrays
from pandas.core.strings.base import BaseStringArrayMethods

from pandas.io._util import _arrow_dtype_mapping
@@ -2625,6 +2627,48 @@ def _dt_tz_convert(self, tz):
result = self._pa_array.cast(pa.timestamp(current_unit, tz))
return type(self)(result)

def _factorize_with_other_for_merge(
self, other: Self, sort: bool = False
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
if not isinstance(self.dtype, StringDtype) and (
pa.types.is_floating(self.dtype.pyarrow_dtype)
or pa.types.is_integer(self.dtype.pyarrow_dtype)
or pa.types.is_unsigned_integer(self.dtype.pyarrow_dtype)
or pa.types.is_duration(self.dtype.pyarrow_dtype)
or pa.types.is_date(self.dtype.pyarrow_dtype)
or pa.types.is_timestamp(self.dtype.pyarrow_dtype)
):
# Keep using current logic
# TODO: check if new implementation with dictionary_encode is faster
lk = self.to_numpy(na_value=1, dtype=self.dtype.numpy_dtype)
rk = other.to_numpy(na_value=1, dtype=lk.dtype)

if needs_i8_conversion(lk.dtype):
lk = np.asarray(lk, dtype=np.int64)
rk = np.asarray(rk, dtype=np.int64)

return factorize_arrays(lk, rk, sort, self.isna(), other.isna())

len_left = len(self)
left = self._pa_array
right = other._pa_array
dc = (
pa.chunked_array(left.chunks + right.chunks)
.combine_chunks()
.dictionary_encode()
)
length = len(dc.dictionary)

llab = pc.fill_null(dc.indices[slice(len_left)], length)
llab = llab.to_numpy().astype(np.intp, copy=False)
rlab = pc.fill_null(dc.indices[slice(len_left, None)], length)
rlab = rlab.to_numpy().astype(np.intp, copy=False)
count = len(dc.dictionary)

if dc.null_count > 0:
count += 1
return llab, rlab, count


def transpose_homogeneous_pyarrow(
arrays: Sequence[ArrowExtensionArray],
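The pyarrow branch above works by encoding both sides against one shared dictionary and then splitting the resulting codes back into a left and a right half. A simplified standalone sketch of that trick (toy inputs, and `pa.concat_arrays` in place of rebuilding a chunked array):

```python
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

left = pa.array(["a", "b", None])
right = pa.array(["b", "c"])

# One dictionary over both sides, then split the codes back apart.
dc = pa.concat_arrays([left, right]).dictionary_encode()
length = len(dc.dictionary)  # number of distinct non-null values
llab = pc.fill_null(dc.indices[: len(left)], length).to_numpy().astype(np.intp)
rlab = pc.fill_null(dc.indices[len(left) :], length).to_numpy().astype(np.intp)
count = length + 1 if dc.null_count > 0 else length
# llab -> [0, 1, 3], rlab -> [1, 2], count -> 4 (nulls map to sentinel code 3)
```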
28 changes: 28 additions & 0 deletions pandas/core/arrays/base.py
@@ -70,6 +70,7 @@
unique,
)
from pandas.core.array_algos.quantile import quantile_with_mask
from pandas.core.reshape.merge_utils import factorize_arrays
from pandas.core.sorting import (
nargminmax,
nargsort,
@@ -2383,6 +2384,33 @@ def _groupby_op(
else:
raise NotImplementedError

def _factorize_with_other_for_merge(
self, other: Self, sort: bool = False
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
"""
Factorize two arrays to get codes for merge operations.

This allows extension array authors to implement efficient factorizations
for merge operations.

Parameters
----------
other : ExtensionArray
    An ExtensionArray with the same dtype as ``self``.
sort : bool, default False
    Whether to sort the shared uniques before computing the codes.

Returns
-------
tuple of (ndarray[intp], ndarray[intp], int)
    The codes for the left array, the codes for the right array, and the
    number of unique elements.

Examples
--------
Both arrays are factorized against one shared set of uniques, so equal
values in ``self`` and ``other`` map to equal codes.
"""
# self and other have equal dtypes after _values_for_factorize
lk, _ = self._values_for_factorize()
rk, _ = other._values_for_factorize()
return factorize_arrays(lk, rk, sort)


class ExtensionArraySupportsAnyAll(ExtensionArray):
def any(self, *, skipna: bool = True) -> bool:
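The base-class default above simply routes both sides through `_values_for_factorize`. The point of exposing the hook is that third-party arrays can override it with something cheaper; a hypothetical sketch (the `IPAddressArray` class and its `_ints` int64 buffer are made up for illustration, and it assumes `factorize_arrays` accepts plain integer ndarrays as it does elsewhere in this diff):

```python
import numpy as np

from pandas.api.extensions import ExtensionArray
from pandas.core.reshape.merge_utils import factorize_arrays  # added by this PR


class IPAddressArray(ExtensionArray):
    # ... the rest of the ExtensionArray interface elided ...

    def _factorize_with_other_for_merge(self, other, sort: bool = False):
        # Factorize the underlying int64 buffers directly instead of the
        # (potentially object-dtype) values from _values_for_factorize.
        lk = np.asarray(self._ints, dtype=np.int64)
        rk = np.asarray(other._ints, dtype=np.int64)
        return factorize_arrays(lk, rk, sort)
```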
7 changes: 7 additions & 0 deletions pandas/core/arrays/categorical.py
@@ -84,6 +84,7 @@
sanitize_array,
)
from pandas.core.ops.common import unpack_zerodim_and_defer
from pandas.core.reshape.merge_utils import factorize_arrays
from pandas.core.sorting import nargsort
from pandas.core.strings.object_array import ObjectStringArrayMixin

@@ -2749,6 +2750,12 @@ def _groupby_op(
res_values[result_mask == 1] = -1
return self._from_backing_data(res_values)

def _factorize_with_other_for_merge(
self, other: Self, sort: bool = False
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
rk = self._encode_with_my_categories(other)
return factorize_arrays(self.codes, rk.codes, sort)


# The Series.cat accessor

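The categorical implementation leans on `_encode_with_my_categories` to recode `other` onto `self`'s categories before comparing integer codes; the public `set_categories` has roughly the same effect (toy example, not from the diff):

```python
import pandas as pd

left = pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"])
right = pd.Categorical(["b", "c"], categories=["c", "b", "a"])

# Recode `right` onto `left`'s category order so codes mean the same thing
# on both sides; after that the codes can be factorized as plain integers.
right_aligned = right.set_categories(left.categories)
print(left.codes)           # [0 1 0]
print(right_aligned.codes)  # [1 2]
```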
9 changes: 9 additions & 0 deletions pandas/core/arrays/datetimelike.py
@@ -141,6 +141,7 @@
invalid_comparison,
make_invalid_op,
)
from pandas.core.reshape.merge_utils import factorize_arrays

from pandas.tseries import frequencies

@@ -1705,6 +1706,14 @@ def _groupby_op(
res_values = res_values.view(self._ndarray.dtype)
return self._from_backing_data(res_values)

def _factorize_with_other_for_merge(
self, other: Self, sort: bool = False
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
# caller is responsible for ensuring self.dtype == other.dtype
lk = np.asarray(self._ndarray, dtype=np.int64)
rk = np.asarray(other._ndarray, dtype=np.int64)
return factorize_arrays(lk, rk, sort)


class DatelikeOps(DatetimeLikeArrayMixin):
"""
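The datetime-like implementation casts both sides to their int64 representation before factorizing, since exact values compare identically whether viewed as timestamps or as their underlying integers; a minimal numpy illustration (not from the diff):

```python
import numpy as np

left = np.array(["2024-01-01", "2024-01-02"], dtype="datetime64[ns]")
right = np.array(["2024-01-02"], dtype="datetime64[ns]")

# The int64 view preserves the equality structure of the timestamps,
# so any integer factorizer produces the same codes.
lk = np.asarray(left.view("i8"), dtype=np.int64)
rk = np.asarray(right.view("i8"), dtype=np.int64)
```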
6 changes: 6 additions & 0 deletions pandas/core/arrays/masked.py
@@ -86,6 +86,7 @@
)
from pandas.core.indexers import check_array_indexer
from pandas.core.ops import invalid_comparison
from pandas.core.reshape.merge_utils import factorize_arrays

if TYPE_CHECKING:
from collections.abc import (
@@ -1524,6 +1525,11 @@ def _groupby_op(
# wrap in a MaskedArray
return self._maybe_mask_result(res_values, result_mask)

def _factorize_with_other_for_merge(
self, other: Self, sort: bool = False
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
return factorize_arrays(self._data, other._data, sort, self._mask, other._mask)


def transpose_homogeneous_masked_arrays(
masked_arrays: Sequence[BaseMaskedArray],