From 917e50bceff22084eb0073d3a9021e581e6c3f6f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 3 Sep 2023 01:30:39 +0200 Subject: [PATCH 01/16] ENH: Move factorizing for merge to EA interface --- pandas/core/arrays/arrow/array.py | 34 ++++++ pandas/core/arrays/base.py | 26 +++++ pandas/core/arrays/categorical.py | 10 ++ pandas/core/arrays/masked.py | 8 ++ pandas/core/reshape/merge.py | 162 ++--------------------------- pandas/core/reshape/merge_utils.py | 66 ++++++++++++ 6 files changed, 155 insertions(+), 151 deletions(-) create mode 100644 pandas/core/reshape/merge_utils.py diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4d887ecd1510f..eed1fe0002fb1 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -58,6 +58,7 @@ unpack_tuple_and_ellipses, validate_indices, ) +from pandas.core.reshape.merge_utils import factorize_with_rizer from pandas.core.strings.base import BaseStringArrayMethods from pandas.io._util import _arrow_dtype_mapping @@ -2553,6 +2554,39 @@ def _dt_tz_convert(self, tz): result = self._pa_array.cast(pa.timestamp(current_unit, tz)) return type(self)(result) + def _factorize_with_other( + self, other: ArrowExtensionArray, sort: bool = False + ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: + if not isinstance(self.dtype, StringDtype) and ( + pa.types.is_floating(self.dtype.pyarrow_dtype) + or pa.types.is_integer(self.dtype.pyarrow_dtype) + or pa.types.is_unsigned_integer(self.dtype.pyarrow_dtype) + ): + lk = self.to_numpy(na_value=1, dtype=self.dtype.numpy_dtype) + rk = other.to_numpy(na_value=1, dtype=lk.dtype) + return factorize_with_rizer(lk, rk, sort, self.isna(), other.isna()) + + len_lk = len(self) + lk = self._pa_array # type: ignore[attr-defined] + rk = other._pa_array # type: ignore[union-attr] + dc = ( + pa.chunked_array(lk.chunks + rk.chunks) # type: ignore[union-attr] + .combine_chunks() + .dictionary_encode() + ) + length = len(dc.dictionary) + + llab, rlab, count = ( + pc.fill_null(dc.indices[slice(len_lk)], length) + .to_numpy() + .astype(np.intp, copy=False), + pc.fill_null(dc.indices[slice(len_lk, None)], length) + .to_numpy() + .astype(np.intp, copy=False), + len(dc.dictionary), + ) + return llab, rlab, count + def transpose_homogeneous_pyarrow( arrays: Sequence[ArrowExtensionArray], diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f3bb7323c7d5f..171e1c29c550b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -24,6 +24,7 @@ from pandas._libs import ( algos as libalgos, + hashtable as libhashtable, lib, ) from pandas.compat import set_function_name @@ -69,6 +70,7 @@ unique, ) from pandas.core.array_algos.quantile import quantile_with_mask +from pandas.core.reshape.merge_utils import factorize_with_rizer from pandas.core.sorting import ( nargminmax, nargsort, @@ -102,6 +104,23 @@ from pandas import Index _extension_array_shared_docs: dict[str, str] = {} +_factorizers = { + np.int64: libhashtable.Int64Factorizer, + np.longlong: libhashtable.Int64Factorizer, + np.int32: libhashtable.Int32Factorizer, + np.int16: libhashtable.Int16Factorizer, + np.int8: libhashtable.Int8Factorizer, + np.uint64: libhashtable.UInt64Factorizer, + np.uint32: libhashtable.UInt32Factorizer, + np.uint16: libhashtable.UInt16Factorizer, + np.uint8: libhashtable.UInt8Factorizer, + np.bool_: libhashtable.UInt8Factorizer, + np.float64: libhashtable.Float64Factorizer, + np.float32: libhashtable.Float32Factorizer, + np.complex64: libhashtable.Complex64Factorizer, + np.complex128: libhashtable.Complex128Factorizer, + np.object_: libhashtable.ObjectFactorizer, +} class ExtensionArray: @@ -2266,6 +2285,13 @@ def _groupby_op( else: raise NotImplementedError + def _factorize_with_other( + self, other: ExtensionArray, sort: bool = False + ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: + lk = self.astype(object) + rk = other.astype(object) + return factorize_with_rizer(lk, rk, sort) + class ExtensionArraySupportsAnyAll(ExtensionArray): def any(self, *, skipna: bool = True) -> bool: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 9f63d1f97c54f..31ba73d340b72 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -84,6 +84,7 @@ sanitize_array, ) from pandas.core.ops.common import unpack_zerodim_and_defer +from pandas.core.reshape.merge_utils import factorize_with_rizer from pandas.core.sorting import nargsort from pandas.core.strings.object_array import ObjectStringArrayMixin @@ -2717,6 +2718,15 @@ def _groupby_op( res_values[result_mask == 1] = -1 return self._from_backing_data(res_values) + def _factorize_with_other( + self, other: Categorical, sort: bool = False + ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: + rk = self._encode_with_my_categories(other) + + lk = ensure_int64(self.codes) + rk = ensure_int64(rk.codes) + return factorize_with_rizer(lk, rk, sort) + # The Series.cat accessor diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 2cf28c28427ab..e5e8648446cd9 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -85,6 +85,7 @@ ) from pandas.core.indexers import check_array_indexer from pandas.core.ops import invalid_comparison +from pandas.core.reshape.merge_utils import factorize_with_rizer if TYPE_CHECKING: from collections.abc import ( @@ -1505,6 +1506,13 @@ def _groupby_op( # wrap in a MaskedArray return self._maybe_mask_result(res_values, result_mask) + def _factorize_with_other( + self, other: BaseMaskedArray, sort: bool = False + ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: + return factorize_with_rizer( + self._data, other._data, sort, self._mask, other._mask + ) + def transpose_homogeneous_masked_arrays( masked_arrays: Sequence[BaseMaskedArray], diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 8ef3943ab0d8d..126bf61ddae54 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -23,7 +23,6 @@ from pandas._libs import ( Timedelta, - hashtable as libhashtable, join as libjoin, lib, ) @@ -49,7 +48,6 @@ from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( - ensure_int64, ensure_object, is_bool, is_bool_dtype, @@ -61,7 +59,6 @@ is_number, is_numeric_dtype, is_object_dtype, - is_string_dtype, needs_i8_conversion, ) from pandas.core.dtypes.dtypes import ( @@ -78,7 +75,6 @@ ) from pandas import ( - ArrowDtype, Categorical, Index, MultiIndex, @@ -91,7 +87,6 @@ ExtensionArray, ) from pandas.core.arrays._mixins import NDArrayBackedExtensionArray -from pandas.core.arrays.string_ import StringDtype import pandas.core.common as com from pandas.core.construction import ( ensure_wrapped_if_datetimelike, @@ -99,6 +94,7 @@ ) from pandas.core.frame import _merge_doc from pandas.core.indexes.api import default_index +from pandas.core.reshape.merge_utils import factorize_with_rizer from pandas.core.sorting import is_int64_overflow_possible if TYPE_CHECKING: @@ -107,27 +103,6 @@ from pandas.core.arrays import DatetimeArray from pandas.core.indexes.frozen import FrozenList -_factorizers = { - np.int64: libhashtable.Int64Factorizer, - np.longlong: libhashtable.Int64Factorizer, - np.int32: libhashtable.Int32Factorizer, - np.int16: libhashtable.Int16Factorizer, - np.int8: libhashtable.Int8Factorizer, - np.uint64: libhashtable.UInt64Factorizer, - np.uint32: libhashtable.UInt32Factorizer, - np.uint16: libhashtable.UInt16Factorizer, - np.uint8: libhashtable.UInt8Factorizer, - np.bool_: libhashtable.UInt8Factorizer, - np.float64: libhashtable.Float64Factorizer, - np.float32: libhashtable.Float32Factorizer, - np.complex64: libhashtable.Complex64Factorizer, - np.complex128: libhashtable.Complex128Factorizer, - np.object_: libhashtable.ObjectFactorizer, -} - -# See https://github.com/pandas-dev/pandas/issues/52451 -if np.intc is not np.int32: - _factorizers[np.intc] = libhashtable.Int64Factorizer _known = (np.ndarray, ExtensionArray, Index, ABCSeries) @@ -2405,64 +2380,12 @@ def _factorize_keys( lk = cast("DatetimeArray", lk)._ndarray rk = cast("DatetimeArray", rk)._ndarray - elif ( - isinstance(lk.dtype, CategoricalDtype) - and isinstance(rk.dtype, CategoricalDtype) - and lk.dtype == rk.dtype - ): - assert isinstance(lk, Categorical) - assert isinstance(rk, Categorical) - # Cast rk to encoding so we can compare codes with lk - - rk = lk._encode_with_my_categories(rk) - - lk = ensure_int64(lk.codes) - rk = ensure_int64(rk.codes) - elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: - if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( - isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow" - ): - import pyarrow as pa - import pyarrow.compute as pc - - len_lk = len(lk) - lk = lk._pa_array # type: ignore[attr-defined] - rk = rk._pa_array # type: ignore[union-attr] - dc = ( - pa.chunked_array(lk.chunks + rk.chunks) # type: ignore[union-attr] - .combine_chunks() - .dictionary_encode() - ) - length = len(dc.dictionary) - - llab, rlab, count = ( - pc.fill_null(dc.indices[slice(len_lk)], length) - .to_numpy() - .astype(np.intp, copy=False), - pc.fill_null(dc.indices[slice(len_lk, None)], length) - .to_numpy() - .astype(np.intp, copy=False), - len(dc.dictionary), - ) - if how == "right": - return rlab, llab, count - return llab, rlab, count - - if not isinstance(lk, BaseMaskedArray) and not ( - # exclude arrow dtypes that would get cast to object - isinstance(lk.dtype, ArrowDtype) - and ( - is_numeric_dtype(lk.dtype.numpy_dtype) - or is_string_dtype(lk.dtype) - and not sort - ) - ): - lk, _ = lk._values_for_factorize() + llab, rlab, count = lk._factorize_with_other(rk) - # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute - # "_values_for_factorize" - rk, _ = rk._values_for_factorize() # type: ignore[union-attr] + if how == "right": + return rlab, llab, count + return llab, rlab, count if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype: # GH#23917 TODO: Needs tests for non-matching dtypes @@ -2471,61 +2394,19 @@ def _factorize_keys( lk = np.asarray(lk, dtype=np.int64) rk = np.asarray(rk, dtype=np.int64) - klass, lk, rk = _convert_arrays_and_get_rizer_klass(lk, rk) + lk, rk = _convert_arrays(lk, rk) - rizer = klass(max(len(lk), len(rk))) - - if isinstance(lk, BaseMaskedArray): - assert isinstance(rk, BaseMaskedArray) - llab = rizer.factorize(lk._data, mask=lk._mask) - rlab = rizer.factorize(rk._data, mask=rk._mask) - elif isinstance(lk, ArrowExtensionArray): - assert isinstance(rk, ArrowExtensionArray) - # we can only get here with numeric dtypes - # TODO: Remove when we have a Factorizer for Arrow - llab = rizer.factorize( - lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=lk.isna() - ) - rlab = rizer.factorize( - rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna() - ) + if isinstance(lk, ExtensionArray): + llab, rlab, count = lk._factorize_with_other(rk) else: - # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type - # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], - # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]" - llab = rizer.factorize(lk) # type: ignore[arg-type] - rlab = rizer.factorize(rk) # type: ignore[arg-type] - assert llab.dtype == np.dtype(np.intp), llab.dtype - assert rlab.dtype == np.dtype(np.intp), rlab.dtype - - count = rizer.get_count() - - if sort: - uniques = rizer.uniques.to_array() - llab, rlab = _sort_labels(uniques, llab, rlab) - - # NA group - lmask = llab == -1 - lany = lmask.any() - rmask = rlab == -1 - rany = rmask.any() - - if lany or rany: - if lany: - np.putmask(llab, lmask, count) - if rany: - np.putmask(rlab, rmask, count) - count += 1 + llab, rlab, count = factorize_with_rizer(lk, rk, sort) if how == "right": return rlab, llab, count return llab, rlab, count -def _convert_arrays_and_get_rizer_klass( - lk: ArrayLike, rk: ArrayLike -) -> tuple[type[libhashtable.Factorizer], ArrayLike, ArrayLike]: - klass: type[libhashtable.Factorizer] +def _convert_arrays(lk: ArrayLike, rk: ArrayLike) -> tuple[ArrayLike, ArrayLike]: if is_numeric_dtype(lk.dtype): if lk.dtype != rk.dtype: dtype = find_common_type([lk.dtype, rk.dtype]) @@ -2543,32 +2424,11 @@ def _convert_arrays_and_get_rizer_klass( else: lk = lk.astype(dtype) rk = rk.astype(dtype) - if isinstance(lk, BaseMaskedArray): - # Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]"; - # expected type "Type[object]" - klass = _factorizers[lk.dtype.type] # type: ignore[index] - elif isinstance(lk.dtype, ArrowDtype): - klass = _factorizers[lk.dtype.numpy_dtype.type] - else: - klass = _factorizers[lk.dtype.type] else: - klass = libhashtable.ObjectFactorizer lk = ensure_object(lk) rk = ensure_object(rk) - return klass, lk, rk - - -def _sort_labels( - uniques: np.ndarray, left: npt.NDArray[np.intp], right: npt.NDArray[np.intp] -) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: - llength = len(left) - labels = np.concatenate([left, right]) - - _, new_labels = algos.safe_sort(uniques, labels, use_na_sentinel=True) - new_left, new_right = new_labels[:llength], new_labels[llength:] - - return new_left, new_right + return lk, rk def _get_join_keys( diff --git a/pandas/core/reshape/merge_utils.py b/pandas/core/reshape/merge_utils.py new file mode 100644 index 0000000000000..e4be47454f0cf --- /dev/null +++ b/pandas/core/reshape/merge_utils.py @@ -0,0 +1,66 @@ +from __future__ import annotations + +import numpy as np +import numpy.typing as npt + +import pandas._libs.hashtable as libhashtable + +import pandas.core.algorithms as algos + +_factorizers = { + np.int64: libhashtable.Int64Factorizer, + np.longlong: libhashtable.Int64Factorizer, + np.int32: libhashtable.Int32Factorizer, + np.int16: libhashtable.Int16Factorizer, + np.int8: libhashtable.Int8Factorizer, + np.uint64: libhashtable.UInt64Factorizer, + np.uint32: libhashtable.UInt32Factorizer, + np.uint16: libhashtable.UInt16Factorizer, + np.uint8: libhashtable.UInt8Factorizer, + np.bool_: libhashtable.UInt8Factorizer, + np.float64: libhashtable.Float64Factorizer, + np.float32: libhashtable.Float32Factorizer, + np.complex64: libhashtable.Complex64Factorizer, + np.complex128: libhashtable.Complex128Factorizer, + np.object_: libhashtable.ObjectFactorizer, +} + +# See https://github.com/pandas-dev/pandas/issues/52451 +if np.intc is not np.int32: + _factorizers[np.intc] = libhashtable.Int64Factorizer + + +def _sort_labels( + uniques: np.ndarray, left: npt.NDArray[np.intp], right: npt.NDArray[np.intp] +) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: + llength = len(left) + labels = np.concatenate([left, right]) + + _, new_labels = algos.safe_sort(uniques, labels, use_na_sentinel=True) + new_left, new_right = new_labels[:llength], new_labels[llength:] + + return new_left, new_right + + +def factorize_with_rizer(lk, rk, sort: bool = False, lk_mask=None, rk_mask=None): + rizer = _factorizers[lk.dtype.type](max(len(lk), len(rk))) + llab = rizer.factorize(lk, mask=lk_mask) + rlab = rizer.factorize(rk, mask=rk_mask) + count = rizer.get_count() + + if sort: + uniques = rizer.uniques.to_array() + llab, rlab = _sort_labels(uniques, llab, rlab) + + lmask = llab == -1 + lany = lmask.any() + rmask = rlab == -1 + rany = rmask.any() + + if lany or rany: + if lany: + np.putmask(llab, lmask, count) + if rany: + np.putmask(rlab, rmask, count) + count += 1 + return llab, rlab, count From 26b9db42f6feedbb73aa1c5f4ee3df6b65a44c4f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 3 Sep 2023 01:40:15 +0200 Subject: [PATCH 02/16] ENH: Move factorizing for merge to EA interface --- pandas/core/arrays/datetimelike.py | 8 ++++++++ pandas/core/reshape/merge.py | 28 +++++++--------------------- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 52596f29ffc0c..9e4842a92c922 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -139,6 +139,7 @@ invalid_comparison, make_invalid_op, ) +from pandas.core.reshape.merge_utils import factorize_with_rizer from pandas.tseries import frequencies @@ -1703,6 +1704,13 @@ def _groupby_op( res_values = res_values.view(self._ndarray.dtype) return self._from_backing_data(res_values) + def _factorize_with_other( + self, other: DatetimeLikeArrayMixin, sort: bool = False + ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: + lk = np.asarray(self._ndarray, dtype=np.int64) + rk = np.asarray(other._ndarray, dtype=np.int64) + return factorize_with_rizer(lk, rk, sort) + class DatelikeOps(DatetimeLikeArrayMixin): """ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 126bf61ddae54..756a22ffb4e5e 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2374,32 +2374,18 @@ def _factorize_keys( if ( isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype) ) or (lib.is_np_dtype(lk.dtype, "M") and lib.is_np_dtype(rk.dtype, "M")): - # Extract the ndarray (UTC-localized) values # Note: we dont need the dtypes to match, as these can still be compared lk, rk = cast("DatetimeArray", lk)._ensure_matching_resos(rk) - lk = cast("DatetimeArray", lk)._ndarray - rk = cast("DatetimeArray", rk)._ndarray - elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: - llab, rlab, count = lk._factorize_with_other(rk) - - if how == "right": - return rlab, llab, count - return llab, rlab, count - - if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype: - # GH#23917 TODO: Needs tests for non-matching dtypes - # GH#23917 TODO: needs tests for case where lk is integer-dtype - # and rk is datetime-dtype - lk = np.asarray(lk, dtype=np.int64) - rk = np.asarray(rk, dtype=np.int64) - - lk, rk = _convert_arrays(lk, rk) - - if isinstance(lk, ExtensionArray): + if isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: llab, rlab, count = lk._factorize_with_other(rk) else: - llab, rlab, count = factorize_with_rizer(lk, rk, sort) + lk, rk = _convert_arrays(lk, rk) + + if isinstance(lk, ExtensionArray): + llab, rlab, count = lk._factorize_with_other(rk) + else: + llab, rlab, count = factorize_with_rizer(lk, rk, sort) if how == "right": return rlab, llab, count From 56043fe61d8632fca971f456c4cf3441c8c6e971 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 3 Sep 2023 02:24:00 +0200 Subject: [PATCH 03/16] Add sort --- pandas/core/reshape/merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 756a22ffb4e5e..6db78b71e172d 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2378,12 +2378,12 @@ def _factorize_keys( lk, rk = cast("DatetimeArray", lk)._ensure_matching_resos(rk) if isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: - llab, rlab, count = lk._factorize_with_other(rk) + llab, rlab, count = lk._factorize_with_other(rk, sort) else: lk, rk = _convert_arrays(lk, rk) if isinstance(lk, ExtensionArray): - llab, rlab, count = lk._factorize_with_other(rk) + llab, rlab, count = lk._factorize_with_other(rk, sort) else: llab, rlab, count = factorize_with_rizer(lk, rk, sort) From 58c31838613a7b70d7b662b215cf08aa15a760ea Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 3 Sep 2023 02:26:09 +0200 Subject: [PATCH 04/16] Fix json tests --- pandas/core/arrays/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 171e1c29c550b..3abf7740ff51a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2288,8 +2288,8 @@ def _groupby_op( def _factorize_with_other( self, other: ExtensionArray, sort: bool = False ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: - lk = self.astype(object) - rk = other.astype(object) + lk, _ = self._values_for_factorize() + rk, _ = other._values_for_factorize() return factorize_with_rizer(lk, rk, sort) From 9fd6c6636189a0de19260f456227da36f06027ca Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 3 Sep 2023 16:45:34 +0200 Subject: [PATCH 05/16] Update for timestamps and duration --- pandas/core/arrays/arrow/array.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index eed1fe0002fb1..811734a265ab4 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -36,6 +36,7 @@ is_integer, is_list_like, is_scalar, + needs_i8_conversion, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna @@ -2561,9 +2562,17 @@ def _factorize_with_other( pa.types.is_floating(self.dtype.pyarrow_dtype) or pa.types.is_integer(self.dtype.pyarrow_dtype) or pa.types.is_unsigned_integer(self.dtype.pyarrow_dtype) + or pa.types.is_duration(self.dtype.pyarrow_dtype) + or pa.types.is_date(self.dtype.pyarrow_dtype) + or pa.types.is_timestamp(self.dtype.pyarrow_dtype) ): lk = self.to_numpy(na_value=1, dtype=self.dtype.numpy_dtype) rk = other.to_numpy(na_value=1, dtype=lk.dtype) + + if needs_i8_conversion(lk.dtype): + lk = np.asarray(lk, dtype=np.int64) + rk = np.asarray(rk, dtype=np.int64) + return factorize_with_rizer(lk, rk, sort, self.isna(), other.isna()) len_lk = len(self) From 164d9c39ce6dbf052ee31ec84310ef19f08a68ef Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 3 Sep 2023 20:00:11 +0200 Subject: [PATCH 06/16] Fix typing --- pandas/core/arrays/arrow/array.py | 14 +++++++------- pandas/core/arrays/categorical.py | 6 +++--- pandas/core/arrays/datetimelike.py | 4 +++- pandas/core/arrays/masked.py | 2 +- pandas/core/reshape/merge.py | 2 ++ 5 files changed, 16 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 811734a265ab4..63a1f11021e69 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2556,7 +2556,7 @@ def _dt_tz_convert(self, tz): return type(self)(result) def _factorize_with_other( - self, other: ArrowExtensionArray, sort: bool = False + self, other: ArrowExtensionArray, sort: bool = False # type: ignore[override] ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: if not isinstance(self.dtype, StringDtype) and ( pa.types.is_floating(self.dtype.pyarrow_dtype) @@ -2575,21 +2575,21 @@ def _factorize_with_other( return factorize_with_rizer(lk, rk, sort, self.isna(), other.isna()) - len_lk = len(self) - lk = self._pa_array # type: ignore[attr-defined] - rk = other._pa_array # type: ignore[union-attr] + len_left = len(self) + left = self._pa_array + right = other._pa_array dc = ( - pa.chunked_array(lk.chunks + rk.chunks) # type: ignore[union-attr] + pa.chunked_array(left.chunks + right.chunks) .combine_chunks() .dictionary_encode() ) length = len(dc.dictionary) llab, rlab, count = ( - pc.fill_null(dc.indices[slice(len_lk)], length) + pc.fill_null(dc.indices[slice(len_left)], length) .to_numpy() .astype(np.intp, copy=False), - pc.fill_null(dc.indices[slice(len_lk, None)], length) + pc.fill_null(dc.indices[slice(len_left, None)], length) .to_numpy() .astype(np.intp, copy=False), len(dc.dictionary), diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 31ba73d340b72..7945441e43388 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2719,12 +2719,12 @@ def _groupby_op( return self._from_backing_data(res_values) def _factorize_with_other( - self, other: Categorical, sort: bool = False + self, other: Categorical, sort: bool = False # type: ignore[override] ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: - rk = self._encode_with_my_categories(other) + other = self._encode_with_my_categories(other) lk = ensure_int64(self.codes) - rk = ensure_int64(rk.codes) + rk = ensure_int64(other.codes) return factorize_with_rizer(lk, rk, sort) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 9e4842a92c922..bb2340433db2c 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1705,7 +1705,9 @@ def _groupby_op( return self._from_backing_data(res_values) def _factorize_with_other( - self, other: DatetimeLikeArrayMixin, sort: bool = False + self, + other: DatetimeLikeArrayMixin, # type: ignore[override] + sort: bool = False, ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: lk = np.asarray(self._ndarray, dtype=np.int64) rk = np.asarray(other._ndarray, dtype=np.int64) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index e5e8648446cd9..ff98493be73bf 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1507,7 +1507,7 @@ def _groupby_op( return self._maybe_mask_result(res_values, result_mask) def _factorize_with_other( - self, other: BaseMaskedArray, sort: bool = False + self, other: BaseMaskedArray, sort: bool = False # type: ignore[override] ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: return factorize_with_rizer( self._data, other._data, sort, self._mask, other._mask diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6db78b71e172d..839e8b05d404e 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2378,11 +2378,13 @@ def _factorize_keys( lk, rk = cast("DatetimeArray", lk)._ensure_matching_resos(rk) if isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: + assert isinstance(rk, ExtensionArray) llab, rlab, count = lk._factorize_with_other(rk, sort) else: lk, rk = _convert_arrays(lk, rk) if isinstance(lk, ExtensionArray): + assert isinstance(rk, ExtensionArray) llab, rlab, count = lk._factorize_with_other(rk, sort) else: llab, rlab, count = factorize_with_rizer(lk, rk, sort) From 5d7151dd8ae9784079495a64b3721678cd2a62a0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 7 Sep 2023 23:19:29 +0200 Subject: [PATCH 07/16] Update names --- pandas/core/arrays/arrow/array.py | 8 ++++---- pandas/core/arrays/base.py | 26 ++++---------------------- pandas/core/arrays/categorical.py | 8 ++++---- pandas/core/arrays/datetimelike.py | 10 ++++------ pandas/core/arrays/masked.py | 10 ++++------ pandas/core/reshape/merge.py | 8 ++++---- pandas/core/reshape/merge_utils.py | 18 ++++++++++++------ 7 files changed, 36 insertions(+), 52 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 63a1f11021e69..908c3e328f731 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -59,7 +59,7 @@ unpack_tuple_and_ellipses, validate_indices, ) -from pandas.core.reshape.merge_utils import factorize_with_rizer +from pandas.core.reshape.merge_utils import factorize_arrays from pandas.core.strings.base import BaseStringArrayMethods from pandas.io._util import _arrow_dtype_mapping @@ -2555,8 +2555,8 @@ def _dt_tz_convert(self, tz): result = self._pa_array.cast(pa.timestamp(current_unit, tz)) return type(self)(result) - def _factorize_with_other( - self, other: ArrowExtensionArray, sort: bool = False # type: ignore[override] + def _factorize_with_other_for_merge( + self, other: Self, sort: bool = False # type: ignore[override] ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: if not isinstance(self.dtype, StringDtype) and ( pa.types.is_floating(self.dtype.pyarrow_dtype) @@ -2573,7 +2573,7 @@ def _factorize_with_other( lk = np.asarray(lk, dtype=np.int64) rk = np.asarray(rk, dtype=np.int64) - return factorize_with_rizer(lk, rk, sort, self.isna(), other.isna()) + return factorize_arrays(lk, rk, sort, self.isna(), other.isna()) len_left = len(self) left = self._pa_array diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 3abf7740ff51a..7a303fccd7a06 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -24,7 +24,6 @@ from pandas._libs import ( algos as libalgos, - hashtable as libhashtable, lib, ) from pandas.compat import set_function_name @@ -70,7 +69,7 @@ unique, ) from pandas.core.array_algos.quantile import quantile_with_mask -from pandas.core.reshape.merge_utils import factorize_with_rizer +from pandas.core.reshape.merge_utils import factorize_arrays from pandas.core.sorting import ( nargminmax, nargsort, @@ -104,23 +103,6 @@ from pandas import Index _extension_array_shared_docs: dict[str, str] = {} -_factorizers = { - np.int64: libhashtable.Int64Factorizer, - np.longlong: libhashtable.Int64Factorizer, - np.int32: libhashtable.Int32Factorizer, - np.int16: libhashtable.Int16Factorizer, - np.int8: libhashtable.Int8Factorizer, - np.uint64: libhashtable.UInt64Factorizer, - np.uint32: libhashtable.UInt32Factorizer, - np.uint16: libhashtable.UInt16Factorizer, - np.uint8: libhashtable.UInt8Factorizer, - np.bool_: libhashtable.UInt8Factorizer, - np.float64: libhashtable.Float64Factorizer, - np.float32: libhashtable.Float32Factorizer, - np.complex64: libhashtable.Complex64Factorizer, - np.complex128: libhashtable.Complex128Factorizer, - np.object_: libhashtable.ObjectFactorizer, -} class ExtensionArray: @@ -2285,12 +2267,12 @@ def _groupby_op( else: raise NotImplementedError - def _factorize_with_other( - self, other: ExtensionArray, sort: bool = False + def _factorize_with_other_for_merge( + self, other: Self, sort: bool = False ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: lk, _ = self._values_for_factorize() rk, _ = other._values_for_factorize() - return factorize_with_rizer(lk, rk, sort) + return factorize_arrays(lk, rk, sort) class ExtensionArraySupportsAnyAll(ExtensionArray): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 28fc482cf3ae4..ee19fcf1c666d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -84,7 +84,7 @@ sanitize_array, ) from pandas.core.ops.common import unpack_zerodim_and_defer -from pandas.core.reshape.merge_utils import factorize_with_rizer +from pandas.core.reshape.merge_utils import factorize_arrays from pandas.core.sorting import nargsort from pandas.core.strings.object_array import ObjectStringArrayMixin @@ -2718,14 +2718,14 @@ def _groupby_op( res_values[result_mask == 1] = -1 return self._from_backing_data(res_values) - def _factorize_with_other( - self, other: Categorical, sort: bool = False # type: ignore[override] + def _factorize_with_other_for_merge( + self, other: Self, sort: bool = False # type: ignore[override] ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: other = self._encode_with_my_categories(other) lk = ensure_int64(self.codes) rk = ensure_int64(other.codes) - return factorize_with_rizer(lk, rk, sort) + return factorize_arrays(lk, rk, sort) # The Series.cat accessor diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index bb2340433db2c..16fd3a1d8404e 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -139,7 +139,7 @@ invalid_comparison, make_invalid_op, ) -from pandas.core.reshape.merge_utils import factorize_with_rizer +from pandas.core.reshape.merge_utils import factorize_arrays from pandas.tseries import frequencies @@ -1704,14 +1704,12 @@ def _groupby_op( res_values = res_values.view(self._ndarray.dtype) return self._from_backing_data(res_values) - def _factorize_with_other( - self, - other: DatetimeLikeArrayMixin, # type: ignore[override] - sort: bool = False, + def _factorize_with_other_for_merge( + self, other: Self, sort: bool = False ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: lk = np.asarray(self._ndarray, dtype=np.int64) rk = np.asarray(other._ndarray, dtype=np.int64) - return factorize_with_rizer(lk, rk, sort) + return factorize_arrays(lk, rk, sort) class DatelikeOps(DatetimeLikeArrayMixin): diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index ff98493be73bf..5cdddb9278e26 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -85,7 +85,7 @@ ) from pandas.core.indexers import check_array_indexer from pandas.core.ops import invalid_comparison -from pandas.core.reshape.merge_utils import factorize_with_rizer +from pandas.core.reshape.merge_utils import factorize_arrays if TYPE_CHECKING: from collections.abc import ( @@ -1506,12 +1506,10 @@ def _groupby_op( # wrap in a MaskedArray return self._maybe_mask_result(res_values, result_mask) - def _factorize_with_other( - self, other: BaseMaskedArray, sort: bool = False # type: ignore[override] + def _factorize_with_other_for_merge( + self, other: Self, sort: bool = False # type: ignore[override] ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: - return factorize_with_rizer( - self._data, other._data, sort, self._mask, other._mask - ) + return factorize_arrays(self._data, other._data, sort, self._mask, other._mask) def transpose_homogeneous_masked_arrays( diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 839e8b05d404e..95431b497ecf0 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -94,7 +94,7 @@ ) from pandas.core.frame import _merge_doc from pandas.core.indexes.api import default_index -from pandas.core.reshape.merge_utils import factorize_with_rizer +from pandas.core.reshape.merge_utils import factorize_arrays from pandas.core.sorting import is_int64_overflow_possible if TYPE_CHECKING: @@ -2379,15 +2379,15 @@ def _factorize_keys( if isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: assert isinstance(rk, ExtensionArray) - llab, rlab, count = lk._factorize_with_other(rk, sort) + llab, rlab, count = lk._factorize_with_other_for_merge(rk, sort) else: lk, rk = _convert_arrays(lk, rk) if isinstance(lk, ExtensionArray): assert isinstance(rk, ExtensionArray) - llab, rlab, count = lk._factorize_with_other(rk, sort) + llab, rlab, count = lk._factorize_with_other_for_merge(rk, sort) else: - llab, rlab, count = factorize_with_rizer(lk, rk, sort) + llab, rlab, count = factorize_arrays(lk, rk, sort) if how == "right": return rlab, llab, count diff --git a/pandas/core/reshape/merge_utils.py b/pandas/core/reshape/merge_utils.py index e4be47454f0cf..1ec9c81250815 100644 --- a/pandas/core/reshape/merge_utils.py +++ b/pandas/core/reshape/merge_utils.py @@ -42,14 +42,20 @@ def _sort_labels( return new_left, new_right -def factorize_with_rizer(lk, rk, sort: bool = False, lk_mask=None, rk_mask=None): - rizer = _factorizers[lk.dtype.type](max(len(lk), len(rk))) - llab = rizer.factorize(lk, mask=lk_mask) - rlab = rizer.factorize(rk, mask=rk_mask) - count = rizer.get_count() +def factorize_arrays( + lk: np.ndarray, + rk: np.ndarray, + sort: bool = False, + lk_mask: np.ndarray | None = None, + rk_mask: np.ndarray | None = None, +): + factorizer = _factorizers[lk.dtype.type](max(len(lk), len(rk))) + llab = factorizer.factorize(lk, mask=lk_mask) + rlab = factorizer.factorize(rk, mask=rk_mask) + count = factorizer.get_count() if sort: - uniques = rizer.uniques.to_array() + uniques = factorizer.uniques.to_array() llab, rlab = _sort_labels(uniques, llab, rlab) lmask = llab == -1 From de02cb0bc8c1db1240e82eff2f8fc7466aaff1e3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 7 Sep 2023 23:24:14 +0200 Subject: [PATCH 08/16] Add docs --- doc/source/reference/extensions.rst | 1 + pandas/core/arrays/base.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index e177e2b1d87d5..cf50be8508f23 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -34,6 +34,7 @@ objects. api.extensions.ExtensionArray._accumulate api.extensions.ExtensionArray._concat_same_type + api.extensions.ExtensionArray._factorize_with_other_for_merge api.extensions.ExtensionArray._formatter api.extensions.ExtensionArray._from_factorized api.extensions.ExtensionArray._from_sequence diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 7a303fccd7a06..9d4867c8d6652 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2270,6 +2270,20 @@ def _groupby_op( def _factorize_with_other_for_merge( self, other: Self, sort: bool = False ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: + """Factorizes two arrays to get codes for merge operations. + + This allows extension array authors to implement efficient factorizations + for merge operations. + + Parameters + ---------- + other : ExtensionArray with the same dtype as self. + sort : Whether to sort the result. + + Returns + ------- + tuple of codes for left and right and the number of unique elements. + """ lk, _ = self._values_for_factorize() rk, _ = other._values_for_factorize() return factorize_arrays(lk, rk, sort) From db84cfc5457ea8637cb113d38ce58dce37e18a14 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 7 Sep 2023 23:35:21 +0200 Subject: [PATCH 09/16] Fix --- pandas/core/arrays/categorical.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ee19fcf1c666d..bcfabebda3116 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2722,10 +2722,7 @@ def _factorize_with_other_for_merge( self, other: Self, sort: bool = False # type: ignore[override] ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: other = self._encode_with_my_categories(other) - - lk = ensure_int64(self.codes) - rk = ensure_int64(other.codes) - return factorize_arrays(lk, rk, sort) + return factorize_arrays(self.codes, other.codes, sort) # The Series.cat accessor From 84ff6c11867c51478594c0dab396916ac34bea4a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 8 Sep 2023 21:56:59 +0200 Subject: [PATCH 10/16] Fix typing --- pandas/core/arrays/arrow/array.py | 2 +- pandas/core/arrays/categorical.py | 6 +++--- pandas/core/arrays/masked.py | 2 +- pandas/core/reshape/merge.py | 1 + 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 908c3e328f731..58eb081ede896 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2556,7 +2556,7 @@ def _dt_tz_convert(self, tz): return type(self)(result) def _factorize_with_other_for_merge( - self, other: Self, sort: bool = False # type: ignore[override] + self, other: Self, sort: bool = False ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: if not isinstance(self.dtype, StringDtype) and ( pa.types.is_floating(self.dtype.pyarrow_dtype) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index bcfabebda3116..192b15018a644 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2719,10 +2719,10 @@ def _groupby_op( return self._from_backing_data(res_values) def _factorize_with_other_for_merge( - self, other: Self, sort: bool = False # type: ignore[override] + self, other: Self, sort: bool = False ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: - other = self._encode_with_my_categories(other) - return factorize_arrays(self.codes, other.codes, sort) + rk = self._encode_with_my_categories(other) + return factorize_arrays(self.codes, rk.codes, sort) # The Series.cat accessor diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 5cdddb9278e26..b8b52f5249e72 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1507,7 +1507,7 @@ def _groupby_op( return self._maybe_mask_result(res_values, result_mask) def _factorize_with_other_for_merge( - self, other: Self, sort: bool = False # type: ignore[override] + self, other: Self, sort: bool = False ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: return factorize_arrays(self._data, other._data, sort, self._mask, other._mask) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 95431b497ecf0..a99ffc8252cee 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2387,6 +2387,7 @@ def _factorize_keys( assert isinstance(rk, ExtensionArray) llab, rlab, count = lk._factorize_with_other_for_merge(rk, sort) else: + assert isinstance(rk, np.ndarray) llab, rlab, count = factorize_arrays(lk, rk, sort) if how == "right": From 0e71264832b85471686fefba1b04de09cbf04bb2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 15 Oct 2023 15:49:44 +0200 Subject: [PATCH 11/16] Add null count --- pandas/core/arrays/arrow/array.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index c73337cd7cce5..c7e37823c6d7d 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2676,6 +2676,8 @@ def _factorize_with_other_for_merge( .astype(np.intp, copy=False), len(dc.dictionary), ) + if dc.null_count > 0: + count += 1 return llab, rlab, count From 4238a8cb2f912445d3e02558d897961754ab27b6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 15 Oct 2023 15:53:53 +0200 Subject: [PATCH 12/16] Update --- pandas/core/arrays/arrow/array.py | 17 ++++++++--------- pandas/core/arrays/base.py | 1 + pandas/core/arrays/datetimelike.py | 1 + 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index c7e37823c6d7d..2d4c062861c76 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2648,6 +2648,8 @@ def _factorize_with_other_for_merge( or pa.types.is_date(self.dtype.pyarrow_dtype) or pa.types.is_timestamp(self.dtype.pyarrow_dtype) ): + # Keep using current logic + # TODO: check if new implementation with dictionary_encode is faster lk = self.to_numpy(na_value=1, dtype=self.dtype.numpy_dtype) rk = other.to_numpy(na_value=1, dtype=lk.dtype) @@ -2667,15 +2669,12 @@ def _factorize_with_other_for_merge( ) length = len(dc.dictionary) - llab, rlab, count = ( - pc.fill_null(dc.indices[slice(len_left)], length) - .to_numpy() - .astype(np.intp, copy=False), - pc.fill_null(dc.indices[slice(len_left, None)], length) - .to_numpy() - .astype(np.intp, copy=False), - len(dc.dictionary), - ) + llab = pc.fill_null(dc.indices[slice(len_left)], length) + llab = llab.to_numpy().astype(np.intp, copy=False) + rlab = pc.fill_null(dc.indices[slice(len_left, None)], length) + rlab = rlab.to_numpy().astype(np.intp, copy=False) + count = len(dc.dictionary) + if dc.null_count > 0: count += 1 return llab, rlab, count diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 13d8b3962839e..3ed6ef04cadb4 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2365,6 +2365,7 @@ def _factorize_with_other_for_merge( ------- tuple of codes for left and right and the number of unique elements. """ + # self and other have equal dtypes after _values_for_factorize lk, _ = self._values_for_factorize() rk, _ = other._values_for_factorize() return factorize_arrays(lk, rk, sort) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 9eb3ddefab87d..646c62accdb00 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1708,6 +1708,7 @@ def _groupby_op( def _factorize_with_other_for_merge( self, other: Self, sort: bool = False ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: + # caller is responsible for ensuring self.dtype == other.dtype lk = np.asarray(self._ndarray, dtype=np.int64) rk = np.asarray(other._ndarray, dtype=np.int64) return factorize_arrays(lk, rk, sort) From dea9add8ad4b49e07b69e4b0746cc5d513863f97 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 15 Oct 2023 15:58:26 +0200 Subject: [PATCH 13/16] Fix docstring --- ci/code_checks.sh | 3 +++ pandas/core/arrays/base.py | 7 ++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 6caa39ae42926..42af688340a9f 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -1,3 +1,6 @@ + + + #!/bin/bash # # Run checks related to code quality. diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 3ed6ef04cadb4..d9df509299707 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2351,15 +2351,16 @@ def _groupby_op( def _factorize_with_other_for_merge( self, other: Self, sort: bool = False ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: - """Factorizes two arrays to get codes for merge operations. + """ + Factorize two arrays to get codes for merge operations. This allows extension array authors to implement efficient factorizations for merge operations. Parameters ---------- - other : ExtensionArray with the same dtype as self. - sort : Whether to sort the result. + other : ExtensionArray with the same dtype as self + sort : Whether to sort the result Returns ------- From 8f932836d7ad85cb8451bc31d9dad97b40b535f7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 15 Oct 2023 23:40:43 +0200 Subject: [PATCH 14/16] Add whatsnew --- doc/source/whatsnew/v2.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index ef59c86a21598..36b211ae21073 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -78,6 +78,7 @@ Other enhancements - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`) - :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. (:issue:`54264`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) +- :meth:`ExtensionArray._factorize_with_other_for_merge` interface method added to allow efficient extension type factorize implementations for merge (:issue:`54975`) - :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`) - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) From eb65303f7e0258f36454f9df137460dbbf7be91a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 16 Oct 2023 18:44:09 +0200 Subject: [PATCH 15/16] Update code_checks.sh --- ci/code_checks.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 42af688340a9f..6caa39ae42926 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -1,6 +1,3 @@ - - - #!/bin/bash # # Run checks related to code quality. From dc5f2cafaf5394ef64b6603c9ae249e8e516efd8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 21 Nov 2023 21:30:37 +0100 Subject: [PATCH 16/16] Update docs --- pandas/core/arrays/base.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 0509d24867c65..7799d44afc91b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2401,6 +2401,10 @@ def _factorize_with_other_for_merge( Returns ------- tuple of codes for left and right and the number of unique elements. + + Examples + -------- + This is a factorize of 2 different arrays. """ # self and other have equal dtypes after _values_for_factorize lk, _ = self._values_for_factorize()