From d4d22292fe6c6edd4f4f033b2ed4f0135d51944a Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sat, 11 Jul 2020 16:03:00 +0300 Subject: [PATCH 1/8] Placed the calculation of mask prior to the call of _compare_or_regex_search --- pandas/core/internals/managers.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c82670106d3b6..a573afa5cefaf 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -596,18 +596,24 @@ def replace_list( # figure out our mask apriori to avoid repeated replacements values = self.as_array() - def comp(s, regex=False): + def comp(s, regex=False, mask=None): """ Generate a bool array by perform an equality check, or perform an element-wise regular expression matching """ if isna(s): - return isna(values) + if mask is None: + return isna(values) + return ~mask s = com.maybe_box_datetimelike(s) - return _compare_or_regex_search(values, s, regex) + return _compare_or_regex_search(values, s, regex, mask) - masks = [comp(s, regex) for s in src_list] + # Calculate the mask once, prior to the call of comp + # in order to avoid repeating the same computations + mask = ~isna(values) + + masks = [comp(s, regex, mask) for s in src_list] result_blocks = [] src_len = len(src_list) - 1 @@ -1895,7 +1901,7 @@ def _merge_blocks( def _compare_or_regex_search( - a: ArrayLike, b: Scalar, regex: bool = False + a: ArrayLike, b: Scalar, regex: bool = False, mask: ArrayLike = None ) -> Union[ArrayLike, bool]: """ Compare two array_like inputs of the same shape or two scalar values @@ -1908,6 +1914,7 @@ def _compare_or_regex_search( a : array_like b : scalar regex : bool, default False + mask : array_like Returns ------- @@ -1941,7 +1948,7 @@ def _check_comparison_types( ) # GH#32621 use mask to avoid comparing to NAs - if isinstance(a, np.ndarray) and not isinstance(b, np.ndarray): + if mask is None and isinstance(a, np.ndarray) and not isinstance(b, np.ndarray): mask = np.reshape(~(isna(a)), a.shape) if isinstance(a, np.ndarray): a = a[mask] From ef8940e084afd4e2e8239363a7d6afbc9d68bc91 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sat, 11 Jul 2020 16:54:37 +0300 Subject: [PATCH 2/8] Fix argument type --- pandas/core/internals/managers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index a573afa5cefaf..a0d4f664895c4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1901,7 +1901,7 @@ def _merge_blocks( def _compare_or_regex_search( - a: ArrayLike, b: Scalar, regex: bool = False, mask: ArrayLike = None + a: ArrayLike, b: Scalar, regex: bool = False, mask: Union[ArrayLike, None] = None ) -> Union[ArrayLike, bool]: """ Compare two array_like inputs of the same shape or two scalar values @@ -1960,7 +1960,7 @@ def _check_comparison_types( result = op(a) - if isinstance(result, np.ndarray): + if isinstance(result, np.ndarray) and mask is not None: # The shape of the mask can differ to that of the result # since we may compare only a subset of a's or b's elements tmp = np.zeros(mask.shape, dtype=np.bool_) From 8d47535aa08a06e983cd83e02e0a5decf91eee31 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sat, 11 Jul 2020 17:42:43 +0300 Subject: [PATCH 3/8] Added entry in the whatsnew --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5f93e08d51baa..ae9d5aded46e7 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -856,6 +856,7 @@ Performance improvements - Performance improvement in arithmetic operations (sub, add, mul, div) for MultiIndex (:issue:`34297`) - Performance improvement in `DataFrame[bool_indexer]` when `bool_indexer` is a list (:issue:`33924`) - Significant performance improvement of :meth:`io.formats.style.Styler.render` with styles added with various ways such as :meth:`io.formats.style.Styler.apply`, :meth:`io.formats.style.Styler.applymap` or :meth:`io.formats.style.Styler.bar` (:issue:`19917`) +- Performance improvement in the :func:`Series.replace` when `to_replace` is a dict (:issue:`33920`) .. --------------------------------------------------------------------------- From 799e24a4309e02257005a631f7533920705c27cb Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sat, 11 Jul 2020 19:20:55 +0300 Subject: [PATCH 4/8] Updated argument description --- pandas/core/internals/managers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index a0d4f664895c4..1f3c3f676d15b 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1914,7 +1914,7 @@ def _compare_or_regex_search( a : array_like b : scalar regex : bool, default False - mask : array_like + mask : array_like or None (default) Returns ------- From 4301d57d4c9af6f0ca43fe41d608b5449c33a7ad Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Tue, 14 Jul 2020 17:20:03 +0300 Subject: [PATCH 5/8] Fix arg ordering and types --- pandas/core/internals/managers.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 1f3c3f676d15b..df66c7b4bb737 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -596,14 +596,12 @@ def replace_list( # figure out our mask apriori to avoid repeated replacements values = self.as_array() - def comp(s, regex=False, mask=None): + def comp(s, mask, regex: bool = False): """ Generate a bool array by perform an equality check, or perform an element-wise regular expression matching """ if isna(s): - if mask is None: - return isna(values) return ~mask s = com.maybe_box_datetimelike(s) @@ -613,7 +611,7 @@ def comp(s, regex=False, mask=None): # in order to avoid repeating the same computations mask = ~isna(values) - masks = [comp(s, regex, mask) for s in src_list] + masks = [comp(s, mask, regex) for s in src_list] result_blocks = [] src_len = len(src_list) - 1 @@ -1901,7 +1899,7 @@ def _merge_blocks( def _compare_or_regex_search( - a: ArrayLike, b: Scalar, regex: bool = False, mask: Union[ArrayLike, None] = None + a: ArrayLike, b: Scalar, regex: bool = False, mask: Optional[ArrayLike] = None ) -> Union[ArrayLike, bool]: """ Compare two array_like inputs of the same shape or two scalar values From ec878a171602df46ab1719bbc132416d882b05d3 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Wed, 15 Jul 2020 11:19:19 +0300 Subject: [PATCH 6/8] Update argument types in comp --- pandas/core/internals/managers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index df66c7b4bb737..fdf72a1e3db13 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -596,7 +596,7 @@ def replace_list( # figure out our mask apriori to avoid repeated replacements values = self.as_array() - def comp(s, mask, regex: bool = False): + def comp(s: Scalar, mask: ArrayLike, regex: bool = False): """ Generate a bool array by perform an equality check, or perform an element-wise regular expression matching From dc2a00bb19d2dd635c3da98f2e5d7799e57ad142 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Wed, 15 Jul 2020 11:25:46 +0300 Subject: [PATCH 7/8] Update v1.1.0.rst --- doc/source/whatsnew/v1.1.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 31b45d691844b..3faca9c8868ca 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -832,7 +832,6 @@ Performance improvements - Performance improvement in arithmetic operations (sub, add, mul, div) for MultiIndex (:issue:`34297`) - Performance improvement in `DataFrame[bool_indexer]` when `bool_indexer` is a list (:issue:`33924`) - Significant performance improvement of :meth:`io.formats.style.Styler.render` with styles added with various ways such as :meth:`io.formats.style.Styler.apply`, :meth:`io.formats.style.Styler.applymap` or :meth:`io.formats.style.Styler.bar` (:issue:`19917`) -- Performance improvement in the :func:`Series.replace` when `to_replace` is a dict (:issue:`33920`) .. --------------------------------------------------------------------------- From 7eef4aa138653addc9dd1029af17ae18425134fe Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Wed, 15 Jul 2020 11:47:06 +0300 Subject: [PATCH 8/8] Correct mask arg type --- pandas/core/internals/managers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index fdf72a1e3db13..d5947726af7fd 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -596,7 +596,7 @@ def replace_list( # figure out our mask apriori to avoid repeated replacements values = self.as_array() - def comp(s: Scalar, mask: ArrayLike, regex: bool = False): + def comp(s: Scalar, mask: np.ndarray, regex: bool = False): """ Generate a bool array by perform an equality check, or perform an element-wise regular expression matching