diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index b22590759ea3f..598dc07da3478 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -567,10 +567,12 @@ Other API changes
 
 Deprecations
 ~~~~~~~~~~~~
+- Deprecated allowing array-like values for ``left_on`` or ``right_on`` other than ``np.ndarray``, :class:`ExtensionArray`, :class:`Index`, and :class:`Series` (:issue:`48454`)
 - Deprecated argument ``infer_datetime_format`` in :func:`to_datetime` and :func:`read_csv`, as a strict version of it is now the default (:issue:`48621`)
 - Deprecated :func:`pandas.io.sql.execute`(:issue:`50185`)
 - :meth:`Index.is_integer` has been deprecated. Use :func:`pandas.api.types.is_integer_dtype` instead (:issue:`50042`)
 - :meth:`Index.is_floating` has been deprecated. Use :func:`pandas.api.types.is_float_dtype` instead (:issue:`50042`)
+-
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.prior_deprecations:
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 8d009d25a66ba..01574df25147d 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -1151,6 +1151,22 @@ def _get_merge_keys(
         is_lkey = lambda x: is_array_like(x) and len(x) == len(left)
         is_rkey = lambda x: is_array_like(x) and len(x) == len(right)
 
+        def deprecate_unknown_arraylike(obj) -> None:
+            # The existing code (but not the docs) allows any iterable object
+            # with a 'dtype' attribute. In the future, we want to restrict
+            # this to the specific array-like types checked below.
+            if is_array_like(obj) and not isinstance(
+                obj, (np.ndarray, ExtensionArray, Index, ABCSeries)
+            ):
+                warnings.warn(
+                    "In a future version, the 'on', 'left_on', and 'right_on' "
+                    "keywords will only allow array-like objects that are one "
+                    "of the following types: "
+                    "numpy.ndarray, ExtensionArray, Index, Series.",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
+
         # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A
         # user could, for example, request 'left_index' and 'left_by'. In a
         # regular pd.merge(), users cannot specify both 'left_index' and
@@ -1164,6 +1180,8 @@ def _get_merge_keys(
         # ugh, spaghetti re #733
         if _any(self.left_on) and _any(self.right_on):
             for lk, rk in zip(self.left_on, self.right_on):
+                deprecate_unknown_arraylike(lk)
+                deprecate_unknown_arraylike(rk)
                 if is_lkey(lk):
                     lk = cast(AnyArrayLike, lk)
                     left_keys.append(lk)
@@ -1214,6 +1232,7 @@ def _get_merge_keys(
                         join_names.append(left.index.name)
         elif _any(self.left_on):
             for k in self.left_on:
+                deprecate_unknown_arraylike(k)
                 if is_lkey(k):
                     k = cast(AnyArrayLike, k)
                     left_keys.append(k)
@@ -1235,6 +1254,7 @@ def _get_merge_keys(
                 right_keys = [self.right.index._values]
         elif _any(self.right_on):
             for k in self.right_on:
+                deprecate_unknown_arraylike(k)
                 if is_rkey(k):
                     k = cast(AnyArrayLike, k)
                     right_keys.append(k)
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index fc2069c5d1e42..3abc59dbd46b2 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -8,6 +8,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 from pandas.core.dtypes.common import (
     is_categorical_dtype,
     is_object_dtype,
@@ -2720,6 +2722,25 @@ def test_merge_different_index_names():
     tm.assert_frame_equal(result, expected)
 
 
+@td.skip_if_no("dask")
+def test_merge_on_arraylike_deprecation():
+    # GH#48454 deprecate allowing non-standard array-likes for "on"
+
+    left = DataFrame({"A": range(3), "B": range(1, 4)})
+    right = DataFrame({"C": range(2, 5)})
+
+    import dask.array
+
+    arr = dask.array.array([0, 1, 2])  # matches left["A"]
+
+    msg = "will only allow array-like objects that are one of the following types"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        res = merge(left, right, left_on="A", right_on=arr)
+
+    expected = merge(left, right, left_on="A", right_on=np.array(arr))
+    tm.assert_frame_equal(res, expected)
+
+
 def test_merge_ea(any_numeric_ea_dtype, join_type):
     # GH#44240
     left = DataFrame({"a": [1, 2, 3], "b": 1}, dtype=any_numeric_ea_dtype)