From 917e50bceff22084eb0073d3a9021e581e6c3f6f Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Sun, 3 Sep 2023 01:30:39 +0200
Subject: [PATCH 01/16] ENH: Move factorizing for merge to EA interface

---
 pandas/core/arrays/arrow/array.py  |  34 ++++++
 pandas/core/arrays/base.py         |  26 +++++
 pandas/core/arrays/categorical.py  |  10 ++
 pandas/core/arrays/masked.py       |   8 ++
 pandas/core/reshape/merge.py       | 162 ++---------------------------
 pandas/core/reshape/merge_utils.py |  66 ++++++++++++
 6 files changed, 155 insertions(+), 151 deletions(-)
 create mode 100644 pandas/core/reshape/merge_utils.py

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 4d887ecd1510f..eed1fe0002fb1 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -58,6 +58,7 @@
     unpack_tuple_and_ellipses,
     validate_indices,
 )
+from pandas.core.reshape.merge_utils import factorize_with_rizer
 from pandas.core.strings.base import BaseStringArrayMethods
 
 from pandas.io._util import _arrow_dtype_mapping
@@ -2553,6 +2554,39 @@ def _dt_tz_convert(self, tz):
         result = self._pa_array.cast(pa.timestamp(current_unit, tz))
         return type(self)(result)
 
+    def _factorize_with_other(
+        self, other: ArrowExtensionArray, sort: bool = False
+    ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
+        if not isinstance(self.dtype, StringDtype) and (
+            pa.types.is_floating(self.dtype.pyarrow_dtype)
+            or pa.types.is_integer(self.dtype.pyarrow_dtype)
+            or pa.types.is_unsigned_integer(self.dtype.pyarrow_dtype)
+        ):
+            lk = self.to_numpy(na_value=1, dtype=self.dtype.numpy_dtype)
+            rk = other.to_numpy(na_value=1, dtype=lk.dtype)
+            return factorize_with_rizer(lk, rk, sort, self.isna(), other.isna())
+
+        len_lk = len(self)
+        lk = self._pa_array  # type: ignore[attr-defined]
+        rk = other._pa_array  # type: ignore[union-attr]
+        dc = (
+            pa.chunked_array(lk.chunks + rk.chunks)  # type: ignore[union-attr]
+            .combine_chunks()
+            .dictionary_encode()
+        )
+        length = len(dc.dictionary)
+
+        llab, rlab, count = (
+            pc.fill_null(dc.indices[slice(len_lk)], length)
+            .to_numpy()
+            .astype(np.intp, copy=False),
+            pc.fill_null(dc.indices[slice(len_lk, None)], length)
+            .to_numpy()
+            .astype(np.intp, copy=False),
+            len(dc.dictionary),
+        )
+        return llab, rlab, count
+
 
 def transpose_homogeneous_pyarrow(
     arrays: Sequence[ArrowExtensionArray],
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index f3bb7323c7d5f..171e1c29c550b 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -24,6 +24,7 @@
 
 from pandas._libs import (
     algos as libalgos,
+    hashtable as libhashtable,
     lib,
 )
 from pandas.compat import set_function_name
@@ -69,6 +70,7 @@
     unique,
 )
 from pandas.core.array_algos.quantile import quantile_with_mask
+from pandas.core.reshape.merge_utils import factorize_with_rizer
 from pandas.core.sorting import (
     nargminmax,
     nargsort,
@@ -102,6 +104,23 @@
     from pandas import Index
 
 _extension_array_shared_docs: dict[str, str] = {}
+_factorizers = {
+    np.int64: libhashtable.Int64Factorizer,
+    np.longlong: libhashtable.Int64Factorizer,
+    np.int32: libhashtable.Int32Factorizer,
+    np.int16: libhashtable.Int16Factorizer,
+    np.int8: libhashtable.Int8Factorizer,
+    np.uint64: libhashtable.UInt64Factorizer,
+    np.uint32: libhashtable.UInt32Factorizer,
+    np.uint16: libhashtable.UInt16Factorizer,
+    np.uint8: libhashtable.UInt8Factorizer,
+    np.bool_: libhashtable.UInt8Factorizer,
+    np.float64: libhashtable.Float64Factorizer,
+    np.float32: libhashtable.Float32Factorizer,
+    np.complex64: libhashtable.Complex64Factorizer,
+    np.complex128: libhashtable.Complex128Factorizer,
+    np.object_: libhashtable.ObjectFactorizer,
+}
 
 
 class ExtensionArray:
@@ -2266,6 +2285,13 @@ def _groupby_op(
         else:
             raise NotImplementedError
 
+    def _factorize_with_other(
+        self, other: ExtensionArray, sort: bool = False
+    ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
+        lk = self.astype(object)
+        rk = other.astype(object)
+        return factorize_with_rizer(lk, rk, sort)
+
 
 class ExtensionArraySupportsAnyAll(ExtensionArray):
     def any(self, *, skipna: bool = True) -> bool:
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 9f63d1f97c54f..31ba73d340b72 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -84,6 +84,7 @@
     sanitize_array,
 )
 from pandas.core.ops.common import unpack_zerodim_and_defer
+from pandas.core.reshape.merge_utils import factorize_with_rizer
 from pandas.core.sorting import nargsort
 from pandas.core.strings.object_array import ObjectStringArrayMixin
 
@@ -2717,6 +2718,15 @@ def _groupby_op(
             res_values[result_mask == 1] = -1
         return self._from_backing_data(res_values)
 
+    def _factorize_with_other(
+        self, other: Categorical, sort: bool = False
+    ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
+        rk = self._encode_with_my_categories(other)
+
+        lk = ensure_int64(self.codes)
+        rk = ensure_int64(rk.codes)
+        return factorize_with_rizer(lk, rk, sort)
+
 
 # The Series.cat accessor
 
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 2cf28c28427ab..e5e8648446cd9 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -85,6 +85,7 @@
 )
 from pandas.core.indexers import check_array_indexer
 from pandas.core.ops import invalid_comparison
+from pandas.core.reshape.merge_utils import factorize_with_rizer
 
 if TYPE_CHECKING:
     from collections.abc import (
@@ -1505,6 +1506,13 @@ def _groupby_op(
         #  wrap in a MaskedArray
         return self._maybe_mask_result(res_values, result_mask)
 
+    def _factorize_with_other(
+        self, other: BaseMaskedArray, sort: bool = False
+    ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
+        return factorize_with_rizer(
+            self._data, other._data, sort, self._mask, other._mask
+        )
+
 
 def transpose_homogeneous_masked_arrays(
     masked_arrays: Sequence[BaseMaskedArray],
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 8ef3943ab0d8d..126bf61ddae54 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -23,7 +23,6 @@
 
 from pandas._libs import (
     Timedelta,
-    hashtable as libhashtable,
     join as libjoin,
     lib,
 )
@@ -49,7 +48,6 @@
 from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.cast import find_common_type
 from pandas.core.dtypes.common import (
-    ensure_int64,
     ensure_object,
     is_bool,
     is_bool_dtype,
@@ -61,7 +59,6 @@
     is_number,
     is_numeric_dtype,
     is_object_dtype,
-    is_string_dtype,
     needs_i8_conversion,
 )
 from pandas.core.dtypes.dtypes import (
@@ -78,7 +75,6 @@
 )
 
 from pandas import (
-    ArrowDtype,
     Categorical,
     Index,
     MultiIndex,
@@ -91,7 +87,6 @@
     ExtensionArray,
 )
 from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
-from pandas.core.arrays.string_ import StringDtype
 import pandas.core.common as com
 from pandas.core.construction import (
     ensure_wrapped_if_datetimelike,
@@ -99,6 +94,7 @@
 )
 from pandas.core.frame import _merge_doc
 from pandas.core.indexes.api import default_index
+from pandas.core.reshape.merge_utils import factorize_with_rizer
 from pandas.core.sorting import is_int64_overflow_possible
 
 if TYPE_CHECKING:
@@ -107,27 +103,6 @@
     from pandas.core.arrays import DatetimeArray
     from pandas.core.indexes.frozen import FrozenList
 
-_factorizers = {
-    np.int64: libhashtable.Int64Factorizer,
-    np.longlong: libhashtable.Int64Factorizer,
-    np.int32: libhashtable.Int32Factorizer,
-    np.int16: libhashtable.Int16Factorizer,
-    np.int8: libhashtable.Int8Factorizer,
-    np.uint64: libhashtable.UInt64Factorizer,
-    np.uint32: libhashtable.UInt32Factorizer,
-    np.uint16: libhashtable.UInt16Factorizer,
-    np.uint8: libhashtable.UInt8Factorizer,
-    np.bool_: libhashtable.UInt8Factorizer,
-    np.float64: libhashtable.Float64Factorizer,
-    np.float32: libhashtable.Float32Factorizer,
-    np.complex64: libhashtable.Complex64Factorizer,
-    np.complex128: libhashtable.Complex128Factorizer,
-    np.object_: libhashtable.ObjectFactorizer,
-}
-
-# See https://github.com/pandas-dev/pandas/issues/52451
-if np.intc is not np.int32:
-    _factorizers[np.intc] = libhashtable.Int64Factorizer
 
 _known = (np.ndarray, ExtensionArray, Index, ABCSeries)
 
@@ -2405,64 +2380,12 @@ def _factorize_keys(
         lk = cast("DatetimeArray", lk)._ndarray
         rk = cast("DatetimeArray", rk)._ndarray
 
-    elif (
-        isinstance(lk.dtype, CategoricalDtype)
-        and isinstance(rk.dtype, CategoricalDtype)
-        and lk.dtype == rk.dtype
-    ):
-        assert isinstance(lk, Categorical)
-        assert isinstance(rk, Categorical)
-        # Cast rk to encoding so we can compare codes with lk
-
-        rk = lk._encode_with_my_categories(rk)
-
-        lk = ensure_int64(lk.codes)
-        rk = ensure_int64(rk.codes)
-
     elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype:
-        if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or (
-            isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow"
-        ):
-            import pyarrow as pa
-            import pyarrow.compute as pc
-
-            len_lk = len(lk)
-            lk = lk._pa_array  # type: ignore[attr-defined]
-            rk = rk._pa_array  # type: ignore[union-attr]
-            dc = (
-                pa.chunked_array(lk.chunks + rk.chunks)  # type: ignore[union-attr]
-                .combine_chunks()
-                .dictionary_encode()
-            )
-            length = len(dc.dictionary)
-
-            llab, rlab, count = (
-                pc.fill_null(dc.indices[slice(len_lk)], length)
-                .to_numpy()
-                .astype(np.intp, copy=False),
-                pc.fill_null(dc.indices[slice(len_lk, None)], length)
-                .to_numpy()
-                .astype(np.intp, copy=False),
-                len(dc.dictionary),
-            )
-            if how == "right":
-                return rlab, llab, count
-            return llab, rlab, count
-
-        if not isinstance(lk, BaseMaskedArray) and not (
-            # exclude arrow dtypes that would get cast to object
-            isinstance(lk.dtype, ArrowDtype)
-            and (
-                is_numeric_dtype(lk.dtype.numpy_dtype)
-                or is_string_dtype(lk.dtype)
-                and not sort
-            )
-        ):
-            lk, _ = lk._values_for_factorize()
+        llab, rlab, count = lk._factorize_with_other(rk)
 
-            # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute
-            # "_values_for_factorize"
-            rk, _ = rk._values_for_factorize()  # type: ignore[union-attr]
+        if how == "right":
+            return rlab, llab, count
+        return llab, rlab, count
 
     if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype:
         # GH#23917 TODO: Needs tests for non-matching dtypes
@@ -2471,61 +2394,19 @@ def _factorize_keys(
         lk = np.asarray(lk, dtype=np.int64)
         rk = np.asarray(rk, dtype=np.int64)
 
-    klass, lk, rk = _convert_arrays_and_get_rizer_klass(lk, rk)
+    lk, rk = _convert_arrays(lk, rk)
 
-    rizer = klass(max(len(lk), len(rk)))
-
-    if isinstance(lk, BaseMaskedArray):
-        assert isinstance(rk, BaseMaskedArray)
-        llab = rizer.factorize(lk._data, mask=lk._mask)
-        rlab = rizer.factorize(rk._data, mask=rk._mask)
-    elif isinstance(lk, ArrowExtensionArray):
-        assert isinstance(rk, ArrowExtensionArray)
-        # we can only get here with numeric dtypes
-        # TODO: Remove when we have a Factorizer for Arrow
-        llab = rizer.factorize(
-            lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=lk.isna()
-        )
-        rlab = rizer.factorize(
-            rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna()
-        )
+    if isinstance(lk, ExtensionArray):
+        llab, rlab, count = lk._factorize_with_other(rk)
     else:
-        # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
-        # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
-        # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
-        llab = rizer.factorize(lk)  # type: ignore[arg-type]
-        rlab = rizer.factorize(rk)  # type: ignore[arg-type]
-    assert llab.dtype == np.dtype(np.intp), llab.dtype
-    assert rlab.dtype == np.dtype(np.intp), rlab.dtype
-
-    count = rizer.get_count()
-
-    if sort:
-        uniques = rizer.uniques.to_array()
-        llab, rlab = _sort_labels(uniques, llab, rlab)
-
-    # NA group
-    lmask = llab == -1
-    lany = lmask.any()
-    rmask = rlab == -1
-    rany = rmask.any()
-
-    if lany or rany:
-        if lany:
-            np.putmask(llab, lmask, count)
-        if rany:
-            np.putmask(rlab, rmask, count)
-        count += 1
+        llab, rlab, count = factorize_with_rizer(lk, rk, sort)
 
     if how == "right":
         return rlab, llab, count
     return llab, rlab, count
 
 
-def _convert_arrays_and_get_rizer_klass(
-    lk: ArrayLike, rk: ArrayLike
-) -> tuple[type[libhashtable.Factorizer], ArrayLike, ArrayLike]:
-    klass: type[libhashtable.Factorizer]
+def _convert_arrays(lk: ArrayLike, rk: ArrayLike) -> tuple[ArrayLike, ArrayLike]:
     if is_numeric_dtype(lk.dtype):
         if lk.dtype != rk.dtype:
             dtype = find_common_type([lk.dtype, rk.dtype])
@@ -2543,32 +2424,11 @@ def _convert_arrays_and_get_rizer_klass(
             else:
                 lk = lk.astype(dtype)
                 rk = rk.astype(dtype)
-        if isinstance(lk, BaseMaskedArray):
-            #  Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]";
-            #  expected type "Type[object]"
-            klass = _factorizers[lk.dtype.type]  # type: ignore[index]
-        elif isinstance(lk.dtype, ArrowDtype):
-            klass = _factorizers[lk.dtype.numpy_dtype.type]
-        else:
-            klass = _factorizers[lk.dtype.type]
 
     else:
-        klass = libhashtable.ObjectFactorizer
         lk = ensure_object(lk)
         rk = ensure_object(rk)
-    return klass, lk, rk
-
-
-def _sort_labels(
-    uniques: np.ndarray, left: npt.NDArray[np.intp], right: npt.NDArray[np.intp]
-) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
-    llength = len(left)
-    labels = np.concatenate([left, right])
-
-    _, new_labels = algos.safe_sort(uniques, labels, use_na_sentinel=True)
-    new_left, new_right = new_labels[:llength], new_labels[llength:]
-
-    return new_left, new_right
+    return lk, rk
 
 
 def _get_join_keys(
diff --git a/pandas/core/reshape/merge_utils.py b/pandas/core/reshape/merge_utils.py
new file mode 100644
index 0000000000000..e4be47454f0cf
--- /dev/null
+++ b/pandas/core/reshape/merge_utils.py
@@ -0,0 +1,66 @@
+from __future__ import annotations
+
+import numpy as np
+import numpy.typing as npt
+
+import pandas._libs.hashtable as libhashtable
+
+import pandas.core.algorithms as algos
+
+_factorizers = {
+    np.int64: libhashtable.Int64Factorizer,
+    np.longlong: libhashtable.Int64Factorizer,
+    np.int32: libhashtable.Int32Factorizer,
+    np.int16: libhashtable.Int16Factorizer,
+    np.int8: libhashtable.Int8Factorizer,
+    np.uint64: libhashtable.UInt64Factorizer,
+    np.uint32: libhashtable.UInt32Factorizer,
+    np.uint16: libhashtable.UInt16Factorizer,
+    np.uint8: libhashtable.UInt8Factorizer,
+    np.bool_: libhashtable.UInt8Factorizer,
+    np.float64: libhashtable.Float64Factorizer,
+    np.float32: libhashtable.Float32Factorizer,
+    np.complex64: libhashtable.Complex64Factorizer,
+    np.complex128: libhashtable.Complex128Factorizer,
+    np.object_: libhashtable.ObjectFactorizer,
+}
+
+# See https://github.com/pandas-dev/pandas/issues/52451
+if np.intc is not np.int32:
+    _factorizers[np.intc] = libhashtable.Int64Factorizer
+
+
+def _sort_labels(
+    uniques: np.ndarray, left: npt.NDArray[np.intp], right: npt.NDArray[np.intp]
+) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
+    llength = len(left)
+    labels = np.concatenate([left, right])
+
+    _, new_labels = algos.safe_sort(uniques, labels, use_na_sentinel=True)
+    new_left, new_right = new_labels[:llength], new_labels[llength:]
+
+    return new_left, new_right
+
+
+def factorize_with_rizer(lk, rk, sort: bool = False, lk_mask=None, rk_mask=None):
+    rizer = _factorizers[lk.dtype.type](max(len(lk), len(rk)))
+    llab = rizer.factorize(lk, mask=lk_mask)
+    rlab = rizer.factorize(rk, mask=rk_mask)
+    count = rizer.get_count()
+
+    if sort:
+        uniques = rizer.uniques.to_array()
+        llab, rlab = _sort_labels(uniques, llab, rlab)
+
+    lmask = llab == -1
+    lany = lmask.any()
+    rmask = rlab == -1
+    rany = rmask.any()
+
+    if lany or rany:
+        if lany:
+            np.putmask(llab, lmask, count)
+        if rany:
+            np.putmask(rlab, rmask, count)
+        count += 1
+    return llab, rlab, count

From 26b9db42f6feedbb73aa1c5f4ee3df6b65a44c4f Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Sun, 3 Sep 2023 01:40:15 +0200
Subject: [PATCH 02/16] ENH: Move factorizing for merge to EA interface

---
 pandas/core/arrays/datetimelike.py |  8 ++++++++
 pandas/core/reshape/merge.py       | 28 +++++++---------------------
 2 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index 52596f29ffc0c..9e4842a92c922 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -139,6 +139,7 @@
     invalid_comparison,
     make_invalid_op,
 )
+from pandas.core.reshape.merge_utils import factorize_with_rizer
 
 from pandas.tseries import frequencies
 
@@ -1703,6 +1704,13 @@ def _groupby_op(
         res_values = res_values.view(self._ndarray.dtype)
         return self._from_backing_data(res_values)
 
+    def _factorize_with_other(
+        self, other: DatetimeLikeArrayMixin, sort: bool = False
+    ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
+        lk = np.asarray(self._ndarray, dtype=np.int64)
+        rk = np.asarray(other._ndarray, dtype=np.int64)
+        return factorize_with_rizer(lk, rk, sort)
+
 
 class DatelikeOps(DatetimeLikeArrayMixin):
     """
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 126bf61ddae54..756a22ffb4e5e 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -2374,32 +2374,18 @@ def _factorize_keys(
     if (
         isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype)
     ) or (lib.is_np_dtype(lk.dtype, "M") and lib.is_np_dtype(rk.dtype, "M")):
-        # Extract the ndarray (UTC-localized) values
         # Note: we dont need the dtypes to match, as these can still be compared
         lk, rk = cast("DatetimeArray", lk)._ensure_matching_resos(rk)
-        lk = cast("DatetimeArray", lk)._ndarray
-        rk = cast("DatetimeArray", rk)._ndarray
 
-    elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype:
-        llab, rlab, count = lk._factorize_with_other(rk)
-
-        if how == "right":
-            return rlab, llab, count
-        return llab, rlab, count
-
-    if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype:
-        # GH#23917 TODO: Needs tests for non-matching dtypes
-        # GH#23917 TODO: needs tests for case where lk is integer-dtype
-        #  and rk is datetime-dtype
-        lk = np.asarray(lk, dtype=np.int64)
-        rk = np.asarray(rk, dtype=np.int64)
-
-    lk, rk = _convert_arrays(lk, rk)
-
-    if isinstance(lk, ExtensionArray):
+    if isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype:
         llab, rlab, count = lk._factorize_with_other(rk)
     else:
-        llab, rlab, count = factorize_with_rizer(lk, rk, sort)
+        lk, rk = _convert_arrays(lk, rk)
+
+        if isinstance(lk, ExtensionArray):
+            llab, rlab, count = lk._factorize_with_other(rk)
+        else:
+            llab, rlab, count = factorize_with_rizer(lk, rk, sort)
 
     if how == "right":
         return rlab, llab, count

From 56043fe61d8632fca971f456c4cf3441c8c6e971 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sun, 3 Sep 2023 02:24:00 +0200
Subject: [PATCH 03/16] Add sort

---
 pandas/core/reshape/merge.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 756a22ffb4e5e..6db78b71e172d 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -2378,12 +2378,12 @@ def _factorize_keys(
         lk, rk = cast("DatetimeArray", lk)._ensure_matching_resos(rk)
 
     if isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype:
-        llab, rlab, count = lk._factorize_with_other(rk)
+        llab, rlab, count = lk._factorize_with_other(rk, sort)
     else:
         lk, rk = _convert_arrays(lk, rk)
 
         if isinstance(lk, ExtensionArray):
-            llab, rlab, count = lk._factorize_with_other(rk)
+            llab, rlab, count = lk._factorize_with_other(rk, sort)
         else:
             llab, rlab, count = factorize_with_rizer(lk, rk, sort)
 

From 58c31838613a7b70d7b662b215cf08aa15a760ea Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sun, 3 Sep 2023 02:26:09 +0200
Subject: [PATCH 04/16] Fix json tests

---
 pandas/core/arrays/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 171e1c29c550b..3abf7740ff51a 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -2288,8 +2288,8 @@ def _groupby_op(
     def _factorize_with_other(
         self, other: ExtensionArray, sort: bool = False
     ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
-        lk = self.astype(object)
-        rk = other.astype(object)
+        lk, _ = self._values_for_factorize()
+        rk, _ = other._values_for_factorize()
         return factorize_with_rizer(lk, rk, sort)
 
 

From 9fd6c6636189a0de19260f456227da36f06027ca Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Sun, 3 Sep 2023 16:45:34 +0200
Subject: [PATCH 05/16] Update for timestamps and duration

---
 pandas/core/arrays/arrow/array.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index eed1fe0002fb1..811734a265ab4 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -36,6 +36,7 @@
     is_integer,
     is_list_like,
     is_scalar,
+    needs_i8_conversion,
 )
 from pandas.core.dtypes.dtypes import DatetimeTZDtype
 from pandas.core.dtypes.missing import isna
@@ -2561,9 +2562,17 @@ def _factorize_with_other(
             pa.types.is_floating(self.dtype.pyarrow_dtype)
             or pa.types.is_integer(self.dtype.pyarrow_dtype)
             or pa.types.is_unsigned_integer(self.dtype.pyarrow_dtype)
+            or pa.types.is_duration(self.dtype.pyarrow_dtype)
+            or pa.types.is_date(self.dtype.pyarrow_dtype)
+            or pa.types.is_timestamp(self.dtype.pyarrow_dtype)
         ):
             lk = self.to_numpy(na_value=1, dtype=self.dtype.numpy_dtype)
             rk = other.to_numpy(na_value=1, dtype=lk.dtype)
+
+            if needs_i8_conversion(lk.dtype):
+                lk = np.asarray(lk, dtype=np.int64)
+                rk = np.asarray(rk, dtype=np.int64)
+
             return factorize_with_rizer(lk, rk, sort, self.isna(), other.isna())
 
         len_lk = len(self)

From 164d9c39ce6dbf052ee31ec84310ef19f08a68ef Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Sun, 3 Sep 2023 20:00:11 +0200
Subject: [PATCH 06/16] Fix typing

---
 pandas/core/arrays/arrow/array.py  | 14 +++++++-------
 pandas/core/arrays/categorical.py  |  6 +++---
 pandas/core/arrays/datetimelike.py |  4 +++-
 pandas/core/arrays/masked.py       |  2 +-
 pandas/core/reshape/merge.py       |  2 ++
 5 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 811734a265ab4..63a1f11021e69 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -2556,7 +2556,7 @@ def _dt_tz_convert(self, tz):
         return type(self)(result)
 
     def _factorize_with_other(
-        self, other: ArrowExtensionArray, sort: bool = False
+        self, other: ArrowExtensionArray, sort: bool = False  # type: ignore[override]
     ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
         if not isinstance(self.dtype, StringDtype) and (
             pa.types.is_floating(self.dtype.pyarrow_dtype)
@@ -2575,21 +2575,21 @@ def _factorize_with_other(
 
             return factorize_with_rizer(lk, rk, sort, self.isna(), other.isna())
 
-        len_lk = len(self)
-        lk = self._pa_array  # type: ignore[attr-defined]
-        rk = other._pa_array  # type: ignore[union-attr]
+        len_left = len(self)
+        left = self._pa_array
+        right = other._pa_array
         dc = (
-            pa.chunked_array(lk.chunks + rk.chunks)  # type: ignore[union-attr]
+            pa.chunked_array(left.chunks + right.chunks)
             .combine_chunks()
             .dictionary_encode()
         )
         length = len(dc.dictionary)
 
         llab, rlab, count = (
-            pc.fill_null(dc.indices[slice(len_lk)], length)
+            pc.fill_null(dc.indices[slice(len_left)], length)
             .to_numpy()
             .astype(np.intp, copy=False),
-            pc.fill_null(dc.indices[slice(len_lk, None)], length)
+            pc.fill_null(dc.indices[slice(len_left, None)], length)
             .to_numpy()
             .astype(np.intp, copy=False),
             len(dc.dictionary),
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 31ba73d340b72..7945441e43388 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2719,12 +2719,12 @@ def _groupby_op(
         return self._from_backing_data(res_values)
 
     def _factorize_with_other(
-        self, other: Categorical, sort: bool = False
+        self, other: Categorical, sort: bool = False  # type: ignore[override]
     ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
-        rk = self._encode_with_my_categories(other)
+        other = self._encode_with_my_categories(other)
 
         lk = ensure_int64(self.codes)
-        rk = ensure_int64(rk.codes)
+        rk = ensure_int64(other.codes)
         return factorize_with_rizer(lk, rk, sort)
 
 
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index 9e4842a92c922..bb2340433db2c 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -1705,7 +1705,9 @@ def _groupby_op(
         return self._from_backing_data(res_values)
 
     def _factorize_with_other(
-        self, other: DatetimeLikeArrayMixin, sort: bool = False
+        self,
+        other: DatetimeLikeArrayMixin,  # type: ignore[override]
+        sort: bool = False,
     ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
         lk = np.asarray(self._ndarray, dtype=np.int64)
         rk = np.asarray(other._ndarray, dtype=np.int64)
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index e5e8648446cd9..ff98493be73bf 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -1507,7 +1507,7 @@ def _groupby_op(
         return self._maybe_mask_result(res_values, result_mask)
 
     def _factorize_with_other(
-        self, other: BaseMaskedArray, sort: bool = False
+        self, other: BaseMaskedArray, sort: bool = False  # type: ignore[override]
     ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
         return factorize_with_rizer(
             self._data, other._data, sort, self._mask, other._mask
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 6db78b71e172d..839e8b05d404e 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -2378,11 +2378,13 @@ def _factorize_keys(
         lk, rk = cast("DatetimeArray", lk)._ensure_matching_resos(rk)
 
     if isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype:
+        assert isinstance(rk, ExtensionArray)
         llab, rlab, count = lk._factorize_with_other(rk, sort)
     else:
         lk, rk = _convert_arrays(lk, rk)
 
         if isinstance(lk, ExtensionArray):
+            assert isinstance(rk, ExtensionArray)
             llab, rlab, count = lk._factorize_with_other(rk, sort)
         else:
             llab, rlab, count = factorize_with_rizer(lk, rk, sort)

From 5d7151dd8ae9784079495a64b3721678cd2a62a0 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Thu, 7 Sep 2023 23:19:29 +0200
Subject: [PATCH 07/16] Update names

---
 pandas/core/arrays/arrow/array.py  |  8 ++++----
 pandas/core/arrays/base.py         | 26 ++++----------------------
 pandas/core/arrays/categorical.py  |  8 ++++----
 pandas/core/arrays/datetimelike.py | 10 ++++------
 pandas/core/arrays/masked.py       | 10 ++++------
 pandas/core/reshape/merge.py       |  8 ++++----
 pandas/core/reshape/merge_utils.py | 18 ++++++++++++------
 7 files changed, 36 insertions(+), 52 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 63a1f11021e69..908c3e328f731 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -59,7 +59,7 @@
     unpack_tuple_and_ellipses,
     validate_indices,
 )
-from pandas.core.reshape.merge_utils import factorize_with_rizer
+from pandas.core.reshape.merge_utils import factorize_arrays
 from pandas.core.strings.base import BaseStringArrayMethods
 
 from pandas.io._util import _arrow_dtype_mapping
@@ -2555,8 +2555,8 @@ def _dt_tz_convert(self, tz):
         result = self._pa_array.cast(pa.timestamp(current_unit, tz))
         return type(self)(result)
 
-    def _factorize_with_other(
-        self, other: ArrowExtensionArray, sort: bool = False  # type: ignore[override]
+    def _factorize_with_other_for_merge(
+        self, other: Self, sort: bool = False  # type: ignore[override]
     ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
         if not isinstance(self.dtype, StringDtype) and (
             pa.types.is_floating(self.dtype.pyarrow_dtype)
@@ -2573,7 +2573,7 @@ def _factorize_with_other(
                 lk = np.asarray(lk, dtype=np.int64)
                 rk = np.asarray(rk, dtype=np.int64)
 
-            return factorize_with_rizer(lk, rk, sort, self.isna(), other.isna())
+            return factorize_arrays(lk, rk, sort, self.isna(), other.isna())
 
         len_left = len(self)
         left = self._pa_array
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 3abf7740ff51a..7a303fccd7a06 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -24,7 +24,6 @@
 
 from pandas._libs import (
     algos as libalgos,
-    hashtable as libhashtable,
     lib,
 )
 from pandas.compat import set_function_name
@@ -70,7 +69,7 @@
     unique,
 )
 from pandas.core.array_algos.quantile import quantile_with_mask
-from pandas.core.reshape.merge_utils import factorize_with_rizer
+from pandas.core.reshape.merge_utils import factorize_arrays
 from pandas.core.sorting import (
     nargminmax,
     nargsort,
@@ -104,23 +103,6 @@
     from pandas import Index
 
 _extension_array_shared_docs: dict[str, str] = {}
-_factorizers = {
-    np.int64: libhashtable.Int64Factorizer,
-    np.longlong: libhashtable.Int64Factorizer,
-    np.int32: libhashtable.Int32Factorizer,
-    np.int16: libhashtable.Int16Factorizer,
-    np.int8: libhashtable.Int8Factorizer,
-    np.uint64: libhashtable.UInt64Factorizer,
-    np.uint32: libhashtable.UInt32Factorizer,
-    np.uint16: libhashtable.UInt16Factorizer,
-    np.uint8: libhashtable.UInt8Factorizer,
-    np.bool_: libhashtable.UInt8Factorizer,
-    np.float64: libhashtable.Float64Factorizer,
-    np.float32: libhashtable.Float32Factorizer,
-    np.complex64: libhashtable.Complex64Factorizer,
-    np.complex128: libhashtable.Complex128Factorizer,
-    np.object_: libhashtable.ObjectFactorizer,
-}
 
 
 class ExtensionArray:
@@ -2285,12 +2267,12 @@ def _groupby_op(
         else:
             raise NotImplementedError
 
-    def _factorize_with_other(
-        self, other: ExtensionArray, sort: bool = False
+    def _factorize_with_other_for_merge(
+        self, other: Self, sort: bool = False
     ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
         lk, _ = self._values_for_factorize()
         rk, _ = other._values_for_factorize()
-        return factorize_with_rizer(lk, rk, sort)
+        return factorize_arrays(lk, rk, sort)
 
 
 class ExtensionArraySupportsAnyAll(ExtensionArray):
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 28fc482cf3ae4..ee19fcf1c666d 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -84,7 +84,7 @@
     sanitize_array,
 )
 from pandas.core.ops.common import unpack_zerodim_and_defer
-from pandas.core.reshape.merge_utils import factorize_with_rizer
+from pandas.core.reshape.merge_utils import factorize_arrays
 from pandas.core.sorting import nargsort
 from pandas.core.strings.object_array import ObjectStringArrayMixin
 
@@ -2718,14 +2718,14 @@ def _groupby_op(
             res_values[result_mask == 1] = -1
         return self._from_backing_data(res_values)
 
-    def _factorize_with_other(
-        self, other: Categorical, sort: bool = False  # type: ignore[override]
+    def _factorize_with_other_for_merge(
+        self, other: Self, sort: bool = False  # type: ignore[override]
     ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
         other = self._encode_with_my_categories(other)
 
         lk = ensure_int64(self.codes)
         rk = ensure_int64(other.codes)
-        return factorize_with_rizer(lk, rk, sort)
+        return factorize_arrays(lk, rk, sort)
 
 
 # The Series.cat accessor
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index bb2340433db2c..16fd3a1d8404e 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -139,7 +139,7 @@
     invalid_comparison,
     make_invalid_op,
 )
-from pandas.core.reshape.merge_utils import factorize_with_rizer
+from pandas.core.reshape.merge_utils import factorize_arrays
 
 from pandas.tseries import frequencies
 
@@ -1704,14 +1704,12 @@ def _groupby_op(
         res_values = res_values.view(self._ndarray.dtype)
         return self._from_backing_data(res_values)
 
-    def _factorize_with_other(
-        self,
-        other: DatetimeLikeArrayMixin,  # type: ignore[override]
-        sort: bool = False,
+    def _factorize_with_other_for_merge(
+        self, other: Self, sort: bool = False
     ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
         lk = np.asarray(self._ndarray, dtype=np.int64)
         rk = np.asarray(other._ndarray, dtype=np.int64)
-        return factorize_with_rizer(lk, rk, sort)
+        return factorize_arrays(lk, rk, sort)
 
 
 class DatelikeOps(DatetimeLikeArrayMixin):
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index ff98493be73bf..5cdddb9278e26 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -85,7 +85,7 @@
 )
 from pandas.core.indexers import check_array_indexer
 from pandas.core.ops import invalid_comparison
-from pandas.core.reshape.merge_utils import factorize_with_rizer
+from pandas.core.reshape.merge_utils import factorize_arrays
 
 if TYPE_CHECKING:
     from collections.abc import (
@@ -1506,12 +1506,10 @@ def _groupby_op(
         #  wrap in a MaskedArray
         return self._maybe_mask_result(res_values, result_mask)
 
-    def _factorize_with_other(
-        self, other: BaseMaskedArray, sort: bool = False  # type: ignore[override]
+    def _factorize_with_other_for_merge(
+        self, other: Self, sort: bool = False  # type: ignore[override]
     ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
-        return factorize_with_rizer(
-            self._data, other._data, sort, self._mask, other._mask
-        )
+        return factorize_arrays(self._data, other._data, sort, self._mask, other._mask)
 
 
 def transpose_homogeneous_masked_arrays(
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 839e8b05d404e..95431b497ecf0 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -94,7 +94,7 @@
 )
 from pandas.core.frame import _merge_doc
 from pandas.core.indexes.api import default_index
-from pandas.core.reshape.merge_utils import factorize_with_rizer
+from pandas.core.reshape.merge_utils import factorize_arrays
 from pandas.core.sorting import is_int64_overflow_possible
 
 if TYPE_CHECKING:
@@ -2379,15 +2379,15 @@ def _factorize_keys(
 
     if isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype:
         assert isinstance(rk, ExtensionArray)
-        llab, rlab, count = lk._factorize_with_other(rk, sort)
+        llab, rlab, count = lk._factorize_with_other_for_merge(rk, sort)
     else:
         lk, rk = _convert_arrays(lk, rk)
 
         if isinstance(lk, ExtensionArray):
             assert isinstance(rk, ExtensionArray)
-            llab, rlab, count = lk._factorize_with_other(rk, sort)
+            llab, rlab, count = lk._factorize_with_other_for_merge(rk, sort)
         else:
-            llab, rlab, count = factorize_with_rizer(lk, rk, sort)
+            llab, rlab, count = factorize_arrays(lk, rk, sort)
 
     if how == "right":
         return rlab, llab, count
diff --git a/pandas/core/reshape/merge_utils.py b/pandas/core/reshape/merge_utils.py
index e4be47454f0cf..1ec9c81250815 100644
--- a/pandas/core/reshape/merge_utils.py
+++ b/pandas/core/reshape/merge_utils.py
@@ -42,14 +42,20 @@ def _sort_labels(
     return new_left, new_right
 
 
-def factorize_with_rizer(lk, rk, sort: bool = False, lk_mask=None, rk_mask=None):
-    rizer = _factorizers[lk.dtype.type](max(len(lk), len(rk)))
-    llab = rizer.factorize(lk, mask=lk_mask)
-    rlab = rizer.factorize(rk, mask=rk_mask)
-    count = rizer.get_count()
+def factorize_arrays(
+    lk: np.ndarray,
+    rk: np.ndarray,
+    sort: bool = False,
+    lk_mask: np.ndarray | None = None,
+    rk_mask: np.ndarray | None = None,
+):
+    factorizer = _factorizers[lk.dtype.type](max(len(lk), len(rk)))
+    llab = factorizer.factorize(lk, mask=lk_mask)
+    rlab = factorizer.factorize(rk, mask=rk_mask)
+    count = factorizer.get_count()
 
     if sort:
-        uniques = rizer.uniques.to_array()
+        uniques = factorizer.uniques.to_array()
         llab, rlab = _sort_labels(uniques, llab, rlab)
 
     lmask = llab == -1

From de02cb0bc8c1db1240e82eff2f8fc7466aaff1e3 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Thu, 7 Sep 2023 23:24:14 +0200
Subject: [PATCH 08/16] Add docs

---
 doc/source/reference/extensions.rst |  1 +
 pandas/core/arrays/base.py          | 14 ++++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst
index e177e2b1d87d5..cf50be8508f23 100644
--- a/doc/source/reference/extensions.rst
+++ b/doc/source/reference/extensions.rst
@@ -34,6 +34,7 @@ objects.
 
       api.extensions.ExtensionArray._accumulate
       api.extensions.ExtensionArray._concat_same_type
+      api.extensions.ExtensionArray._factorize_with_other_for_merge
       api.extensions.ExtensionArray._formatter
       api.extensions.ExtensionArray._from_factorized
       api.extensions.ExtensionArray._from_sequence
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 7a303fccd7a06..9d4867c8d6652 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -2270,6 +2270,20 @@ def _groupby_op(
     def _factorize_with_other_for_merge(
         self, other: Self, sort: bool = False
     ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
+        """Factorizes two arrays to get codes for merge operations.
+
+        This allows extension array authors to implement efficient factorizations
+        for merge operations.
+
+        Parameters
+        ----------
+        other : ExtensionArray with the same dtype as self.
+        sort : Whether to sort the result.
+
+        Returns
+        -------
+        tuple of codes for left and right and the number of unique elements.
+        """
         lk, _ = self._values_for_factorize()
         rk, _ = other._values_for_factorize()
         return factorize_arrays(lk, rk, sort)

From db84cfc5457ea8637cb113d38ce58dce37e18a14 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Thu, 7 Sep 2023 23:35:21 +0200
Subject: [PATCH 09/16] Fix

---
 pandas/core/arrays/categorical.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index ee19fcf1c666d..bcfabebda3116 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2722,10 +2722,7 @@ def _factorize_with_other_for_merge(
         self, other: Self, sort: bool = False  # type: ignore[override]
     ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
         other = self._encode_with_my_categories(other)
-
-        lk = ensure_int64(self.codes)
-        rk = ensure_int64(other.codes)
-        return factorize_arrays(lk, rk, sort)
+        return factorize_arrays(self.codes, other.codes, sort)
 
 
 # The Series.cat accessor

From 84ff6c11867c51478594c0dab396916ac34bea4a Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Fri, 8 Sep 2023 21:56:59 +0200
Subject: [PATCH 10/16] Fix typing

---
 pandas/core/arrays/arrow/array.py | 2 +-
 pandas/core/arrays/categorical.py | 6 +++---
 pandas/core/arrays/masked.py      | 2 +-
 pandas/core/reshape/merge.py      | 1 +
 4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 908c3e328f731..58eb081ede896 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -2556,7 +2556,7 @@ def _dt_tz_convert(self, tz):
         return type(self)(result)
 
     def _factorize_with_other_for_merge(
-        self, other: Self, sort: bool = False  # type: ignore[override]
+        self, other: Self, sort: bool = False
     ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
         if not isinstance(self.dtype, StringDtype) and (
             pa.types.is_floating(self.dtype.pyarrow_dtype)
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index bcfabebda3116..192b15018a644 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2719,10 +2719,10 @@ def _groupby_op(
         return self._from_backing_data(res_values)
 
     def _factorize_with_other_for_merge(
-        self, other: Self, sort: bool = False  # type: ignore[override]
+        self, other: Self, sort: bool = False
     ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
-        other = self._encode_with_my_categories(other)
-        return factorize_arrays(self.codes, other.codes, sort)
+        rk = self._encode_with_my_categories(other)
+        return factorize_arrays(self.codes, rk.codes, sort)
 
 
 # The Series.cat accessor
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 5cdddb9278e26..b8b52f5249e72 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -1507,7 +1507,7 @@ def _groupby_op(
         return self._maybe_mask_result(res_values, result_mask)
 
     def _factorize_with_other_for_merge(
-        self, other: Self, sort: bool = False  # type: ignore[override]
+        self, other: Self, sort: bool = False
     ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
         return factorize_arrays(self._data, other._data, sort, self._mask, other._mask)
 
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 95431b497ecf0..a99ffc8252cee 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -2387,6 +2387,7 @@ def _factorize_keys(
             assert isinstance(rk, ExtensionArray)
             llab, rlab, count = lk._factorize_with_other_for_merge(rk, sort)
         else:
+            assert isinstance(rk, np.ndarray)
             llab, rlab, count = factorize_arrays(lk, rk, sort)
 
     if how == "right":

From 0e71264832b85471686fefba1b04de09cbf04bb2 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Sun, 15 Oct 2023 15:49:44 +0200
Subject: [PATCH 11/16] Add null count

---
 pandas/core/arrays/arrow/array.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index c73337cd7cce5..c7e37823c6d7d 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -2676,6 +2676,8 @@ def _factorize_with_other_for_merge(
             .astype(np.intp, copy=False),
             len(dc.dictionary),
         )
+        if dc.null_count > 0:
+            count += 1
         return llab, rlab, count
 
 

From 4238a8cb2f912445d3e02558d897961754ab27b6 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Sun, 15 Oct 2023 15:53:53 +0200
Subject: [PATCH 12/16] Update

---
 pandas/core/arrays/arrow/array.py  | 17 ++++++++---------
 pandas/core/arrays/base.py         |  1 +
 pandas/core/arrays/datetimelike.py |  1 +
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index c7e37823c6d7d..2d4c062861c76 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -2648,6 +2648,8 @@ def _factorize_with_other_for_merge(
             or pa.types.is_date(self.dtype.pyarrow_dtype)
             or pa.types.is_timestamp(self.dtype.pyarrow_dtype)
         ):
+            # Keep the existing numpy-based factorization for these dtypes
+            # TODO: check if the new dictionary_encode-based implementation is faster
             lk = self.to_numpy(na_value=1, dtype=self.dtype.numpy_dtype)
             rk = other.to_numpy(na_value=1, dtype=lk.dtype)
 
@@ -2667,15 +2669,12 @@ def _factorize_with_other_for_merge(
         )
         length = len(dc.dictionary)
 
-        llab, rlab, count = (
-            pc.fill_null(dc.indices[slice(len_left)], length)
-            .to_numpy()
-            .astype(np.intp, copy=False),
-            pc.fill_null(dc.indices[slice(len_left, None)], length)
-            .to_numpy()
-            .astype(np.intp, copy=False),
-            len(dc.dictionary),
-        )
+        llab = pc.fill_null(dc.indices[slice(len_left)], length)
+        llab = llab.to_numpy().astype(np.intp, copy=False)
+        rlab = pc.fill_null(dc.indices[slice(len_left, None)], length)
+        rlab = rlab.to_numpy().astype(np.intp, copy=False)
+        count = len(dc.dictionary)
+
         if dc.null_count > 0:
             count += 1
         return llab, rlab, count
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 13d8b3962839e..3ed6ef04cadb4 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -2365,6 +2365,7 @@ def _factorize_with_other_for_merge(
         -------
         tuple of codes for left and right and the number of unique elements.
         """
+        # self and other have equal dtypes after _values_for_factorize
         lk, _ = self._values_for_factorize()
         rk, _ = other._values_for_factorize()
         return factorize_arrays(lk, rk, sort)
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index 9eb3ddefab87d..646c62accdb00 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -1708,6 +1708,7 @@ def _groupby_op(
     def _factorize_with_other_for_merge(
         self, other: Self, sort: bool = False
     ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
+        # caller is responsible for ensuring self.dtype == other.dtype
         lk = np.asarray(self._ndarray, dtype=np.int64)
         rk = np.asarray(other._ndarray, dtype=np.int64)
         return factorize_arrays(lk, rk, sort)

From dea9add8ad4b49e07b69e4b0746cc5d513863f97 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Sun, 15 Oct 2023 15:58:26 +0200
Subject: [PATCH 13/16] Fix docstring

---
 ci/code_checks.sh          | 3 +++
 pandas/core/arrays/base.py | 7 ++++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 6caa39ae42926..42af688340a9f 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -1,3 +1,6 @@
+
+
+
 #!/bin/bash
 #
 # Run checks related to code quality.
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 3ed6ef04cadb4..d9df509299707 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -2351,15 +2351,16 @@ def _groupby_op(
     def _factorize_with_other_for_merge(
         self, other: Self, sort: bool = False
     ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
-        """Factorizes two arrays to get codes for merge operations.
+        """
+        Factorize two arrays to get codes for merge operations.
 
         This allows extension array authors to implement efficient factorizations
         for merge operations.
 
         Parameters
         ----------
-        other : ExtensionArray with the same dtype as self.
-        sort : Whether to sort the result.
+        other : ExtensionArray with the same dtype as self
+        sort : Whether to sort the result
 
         Returns
         -------

From 8f932836d7ad85cb8451bc31d9dad97b40b535f7 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Sun, 15 Oct 2023 23:40:43 +0200
Subject: [PATCH 14/16] Add whatsnew

---
 doc/source/whatsnew/v2.2.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index ef59c86a21598..36b211ae21073 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -78,6 +78,7 @@ Other enhancements
 - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
 - :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. (:issue:`54264`)
 - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
+- :meth:`ExtensionArray._factorize_with_other_for_merge` interface method added to allow efficient extension type factorize implementations for merge (:issue:`54975`)
 - :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
 - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
 - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)

From eb65303f7e0258f36454f9df137460dbbf7be91a Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Mon, 16 Oct 2023 18:44:09 +0200
Subject: [PATCH 15/16] Update code_checks.sh

---
 ci/code_checks.sh | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 42af688340a9f..6caa39ae42926 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -1,6 +1,3 @@
-
-
-
 #!/bin/bash
 #
 # Run checks related to code quality.

From dc5f2cafaf5394ef64b6603c9ae249e8e516efd8 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Tue, 21 Nov 2023 21:30:37 +0100
Subject: [PATCH 16/16] Update docs

---
 pandas/core/arrays/base.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 0509d24867c65..7799d44afc91b 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -2401,6 +2401,10 @@ def _factorize_with_other_for_merge(
         Returns
         -------
         tuple of codes for left and right and the number of unique elements.
+
+        Examples
+        --------
+        This factorizes two different arrays with a shared set of codes.
         """
         # self and other have equal dtypes after _values_for_factorize
         lk, _ = self._values_for_factorize()