From 7aa2edf58234242c25e1a9de268c0bf81988e585 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 15 Nov 2023 07:36:59 -0500 Subject: [PATCH 1/6] improve perf of index assertions --- pandas/_testing/asserters.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 8e49fcfb355fa..9e32deca435ee 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -43,7 +43,6 @@ Series, TimedeltaIndex, ) -from pandas.core.algorithms import take_nd from pandas.core.arrays import ( DatetimeArray, ExtensionArray, @@ -246,13 +245,6 @@ def _check_types(left, right, obj: str = "Index") -> None: assert_attr_equal("dtype", left, right, obj=obj) - def _get_ilevel_values(index, level): - # accept level number only - unique = index.levels[level] - level_codes = index.codes[level] - filled = take_nd(unique._values, level_codes, fill_value=unique._na_value) - return unique._shallow_copy(filled, name=index.names[level]) - # instance validation _check_isinstance(left, right, Index) @@ -299,9 +291,8 @@ def _get_ilevel_values(index, level): ) assert_numpy_array_equal(left.codes[level], right.codes[level]) except AssertionError: - # cannot use get_level_values here because it can change dtype - llevel = _get_ilevel_values(left, level) - rlevel = _get_ilevel_values(right, level) + llevel = left.get_level_values(level) + rlevel = right.get_level_values(level) assert_index_equal( llevel, @@ -328,7 +319,7 @@ def _get_ilevel_values(index, level): diff = np.sum(mismatch.astype(int)) * 100.0 / len(left) msg = f"{obj} values are different ({np.round(diff, 5)} %)" raise_assert_detail(obj, msg, left, right) - else: + elif not left.equals(right): # if we have "equiv", this becomes True exact_bool = bool(exact) _testing.assert_almost_equal( @@ -592,7 +583,7 @@ def raise_assert_detail( {message}""" if isinstance(index_values, Index): - index_values = np.array(index_values) + index_values = np.asarray(index_values) if isinstance(index_values, np.ndarray): msg += f"\n[index]: {pprint_thing(index_values)}" From 896bb749a5dac3371de417dd4cc45a7b9808b564 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 15 Nov 2023 07:51:27 -0500 Subject: [PATCH 2/6] whatsnew --- doc/source/whatsnew/v2.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 9040dba238c88..c6e2be0387aa0 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -305,7 +305,7 @@ Other Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- Performance improvement in :func:`.testing.assert_frame_equal` and :func:`.testing.assert_series_equal` for objects indexed by a :class:`MultiIndex` (:issue:`55949`) +- Performance improvement in :func:`.testing.assert_frame_equal` and :func:`.testing.assert_series_equal` (:issue:`55949`, :issue:`55971`) - Performance improvement in :func:`concat` with ``axis=1`` and objects with unaligned indexes (:issue:`55084`) - Performance improvement in :func:`merge_asof` when ``by`` is not ``None`` (:issue:`55580`, :issue:`55678`) - Performance improvement in :func:`read_stata` for files with many variables (:issue:`55515`) From 3e5a379a4eacb3f1aaf22322dc414c6c20d032e9 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 15 Nov 2023 19:57:12 -0500 Subject: [PATCH 3/6] faster _array_equivalent_object --- pandas/_testing/asserters.py | 2 +- pandas/core/dtypes/missing.py | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 9e32deca435ee..0d8fb7bfce33a 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -319,7 +319,7 @@ def _check_types(left, right, obj: str = "Index") -> None: diff = np.sum(mismatch.astype(int)) * 100.0 / len(left) msg = f"{obj} values are different ({np.round(diff, 5)} %)" raise_assert_detail(obj, msg, left, right) - elif not left.equals(right): + else: # if we have "equiv", this becomes True exact_bool = bool(exact) _testing.assert_almost_equal( diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 8760c8eeca454..75d4e0d16bb66 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -567,7 +567,20 @@ def _array_equivalent_object(left: np.ndarray, right: np.ndarray, strict_nan: bo return lib.array_equivalent_object(ensure_object(left), ensure_object(right)) - for left_value, right_value in zip(left, right): + mask = isna(left) & isna(right) + try: + if not lib.array_equivalent_object( + ensure_object(left[~mask]), + ensure_object(right[~mask]), + ): + return False + left_remaining = left[mask] + right_remaining = right[mask] + except ValueError: + left_remaining = left + right_remaining = right + + for left_value, right_value in zip(left_remaining, right_remaining): if left_value is NaT and right_value is not NaT: return False From 9b9483724fbabf076c7f986f89b39eb4c18a39f0 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 15 Nov 2023 20:04:08 -0500 Subject: [PATCH 4/6] add comment --- pandas/core/dtypes/missing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 75d4e0d16bb66..a375dd7ae289b 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -577,6 +577,8 @@ def _array_equivalent_object(left: np.ndarray, right: np.ndarray, strict_nan: bo left_remaining = left[mask] right_remaining = right[mask] except ValueError: + # can raise a ValueError if left and right cannot be + # compared (e.g. nested arrays) left_remaining = left right_remaining = right From 3c941980a98e05ea0f905a9f2fe70f193bcbf4b7 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 16 Nov 2023 07:31:54 -0500 Subject: [PATCH 5/6] remove xfail --- pandas/tests/dtypes/test_missing.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index b995dc591c749..1cc1e2725b9c7 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -560,9 +560,7 @@ def test_array_equivalent_str(dtype): ) -@pytest.mark.parametrize( - "strict_nan", [pytest.param(True, marks=pytest.mark.xfail), False] -) +@pytest.mark.parametrize("strict_nan", [True, False]) def test_array_equivalent_nested(strict_nan): # reached in groupby aggregations, make sure we use np.any when checking # if the comparison is truthy @@ -585,9 +583,7 @@ def test_array_equivalent_nested(strict_nan): @pytest.mark.filterwarnings("ignore:elementwise comparison failed:DeprecationWarning") -@pytest.mark.parametrize( - "strict_nan", [pytest.param(True, marks=pytest.mark.xfail), False] -) +@pytest.mark.parametrize("strict_nan", [True, False]) def test_array_equivalent_nested2(strict_nan): # more than one level of nesting left = np.array( @@ -612,9 +608,7 @@ def test_array_equivalent_nested2(strict_nan): assert not array_equivalent(left, right, strict_nan=strict_nan) -@pytest.mark.parametrize( - "strict_nan", [pytest.param(True, marks=pytest.mark.xfail), False] -) +@pytest.mark.parametrize("strict_nan", [True, False]) def test_array_equivalent_nested_list(strict_nan): left = np.array([[50, 70, 90], [20, 30]], dtype=object) right = np.array([[50, 70, 90], [20, 30]], dtype=object) From 1faabfdca4d17eb1f167162ef80a7f4628672ecd Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 17 Nov 2023 06:35:14 -0500 Subject: [PATCH 6/6] skip mask if not needed --- pandas/core/dtypes/missing.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index a375dd7ae289b..a635ac77566e1 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -562,17 +562,19 @@ def _array_equivalent_datetimelike(left: np.ndarray, right: np.ndarray): def _array_equivalent_object(left: np.ndarray, right: np.ndarray, strict_nan: bool): - if not strict_nan: - # isna considers NaN and None to be equivalent. + left = ensure_object(left) + right = ensure_object(right) - return lib.array_equivalent_object(ensure_object(left), ensure_object(right)) + mask: npt.NDArray[np.bool_] | None = None + if strict_nan: + mask = isna(left) & isna(right) + if not mask.any(): + mask = None - mask = isna(left) & isna(right) try: - if not lib.array_equivalent_object( - ensure_object(left[~mask]), - ensure_object(right[~mask]), - ): + if mask is None: + return lib.array_equivalent_object(left, right) + if not lib.array_equivalent_object(left[~mask], right[~mask]): return False left_remaining = left[mask] right_remaining = right[mask]