diff --git a/doc/source/text.rst b/doc/source/text.rst index 3a4a57ff4da95..52e05c5d511bc 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -146,6 +146,25 @@ following code will cause trouble because of the regular expression meaning of # We need to escape the special character (for >1 len patterns) dollars.str.replace(r'-\$', '-') +The ``replace`` method can also take a callable as replacement. It is called +on every match of ``pat`` using :func:`re.sub`. The callable should expect one +positional argument (a regex match object) and return a string. + +.. versionadded:: 0.20.0 + +.. ipython:: python + + # Reverse every lowercase alphabetic word + pat = r'[a-z]+' + repl = lambda m: m.group(0)[::-1] + pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(pat, repl) + + # Using regex groups + pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)" + repl = lambda m: m.group('two').swapcase() + pd.Series(['Foo Bar Baz', np.nan]).str.replace(pat, repl) + + Indexing with ``.str`` ---------------------- @@ -406,7 +425,7 @@ Method Summary :meth:`~Series.str.join`;Join strings in each element of the Series with passed separator :meth:`~Series.str.get_dummies`;Split strings on the delimiter returning DataFrame of dummy variables :meth:`~Series.str.contains`;Return boolean array if each string contains pattern/regex - :meth:`~Series.str.replace`;Replace occurrences of pattern/regex with some other string + :meth:`~Series.str.replace`;Replace occurrences of pattern/regex with some other string or the return value of a callable given the occurrence :meth:`~Series.str.repeat`;Duplicate values (``s.str.repeat(3)`` equivalent to ``x * 3``) :meth:`~Series.str.pad`;"Add whitespace to left, right, or both sides of strings" :meth:`~Series.str.center`;Equivalent to ``str.center`` diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 2db03724e564d..25dffc9a4960d 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -23,6 +23,7 @@ New features 
~~~~~~~~~~~~ - Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here <io.feather>`. +- ``.str.replace`` now accepts a callable as replacement, which is passed to ``re.sub`` (:issue:`15055`) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 3041b17b99b17..c48defe39a011 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -167,7 +167,17 @@ def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): try: convert = not all(mask) result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert) - except (TypeError, AttributeError): + except (TypeError, AttributeError) as e: + # Reraise the exception if callable `f` got wrong number of args. + # The user may want to be warned by this, instead of getting NaN + if compat.PY2: + p_err = r'takes (no|(exactly|at (least|most)) ?\d+) arguments?' + else: + p_err = (r'((takes)|(missing)) (?(2)from \d+ to )?\d+ ' + r'(?(3)required )positional arguments?') + + if len(e.args) >= 1 and re.search(p_err, e.args[0]): + raise e def g(x): try: @@ -303,8 +313,13 @@ def str_replace(arr, pat, repl, n=-1, case=True, flags=0): ---------- pat : string Character sequence or regular expression - repl : string - Replacement sequence + repl : string or callable + Replacement string or a callable. The callable is passed the regex + match object and must return a replacement string to be used. + See :func:`re.sub`. + + .. versionadded:: 0.20.0 + n : int, default -1 (all) Number of replacements to make from start case : boolean, default True @@ -315,12 +330,53 @@ def str_replace(arr, pat, repl, n=-1, case=True, flags=0): Returns ------- replaced : Series/Index of objects + + Examples + -------- + When ``repl`` is a string, every ``pat`` is replaced as with + :meth:`str.replace`. NaN value(s) in the Series are left as is. 
+ + >>> Series(['foo', 'fuz', np.nan]).str.replace('f', 'b') + 0 boo + 1 buz + 2 NaN + dtype: object + + When ``repl`` is a callable, it is called on every match of ``pat`` using + :func:`re.sub`. The callable should expect one positional argument + (a regex match object) and return a string. + + To get the idea: + + >>> Series(['foo', 'fuz', np.nan]).str.replace('f', repr) + 0 <_sre.SRE_Match object; span=(0, 1), match='f'>oo + 1 <_sre.SRE_Match object; span=(0, 1), match='f'>uz + 2 NaN + dtype: object + + Reverse every lowercase alphabetic word: + + >>> repl = lambda m: m.group(0)[::-1] + >>> Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl) + 0 oof 123 + 1 rab zab + 2 NaN + dtype: object + + Using regex groups: + + >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)" + >>> repl = lambda m: m.group('two').swapcase() + >>> Series(['Foo Bar Baz', np.nan]).str.replace(pat, repl) + 0 bAR + 1 NaN + dtype: object """ - # Check whether repl is valid (GH 13438) - if not is_string_like(repl): - raise TypeError("repl must be a string") - use_re = not case or len(pat) > 1 or flags + # Check whether repl is valid (GH 13438, GH 15055) + if not (is_string_like(repl) or callable(repl)): + raise TypeError("repl must be a string or callable") + use_re = not case or len(pat) > 1 or flags or callable(repl) if use_re: if not case: diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index bbcd856250c51..47b64eac33d0b 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -436,6 +436,43 @@ def test_replace(self): values = klass(data) self.assertRaises(TypeError, values.str.replace, 'a', repl) + def test_replace_callable(self): + # GH 15055 + values = Series(['fooBAD__barBAD', NA]) + + # test with callable + repl = lambda m: m.group(0).swapcase() + result = values.str.replace('[a-z][A-Z]{2}', repl, n=2) + exp = Series(['foObaD__baRbaD', NA]) + tm.assert_series_equal(result, exp) + + # test with wrong number of arguments, raising an error + if 
compat.PY2: + p_err = r'takes (no|(exactly|at (least|most)) ?\d+) arguments?' + else: + p_err = (r'((takes)|(missing)) (?(2)from \d+ to )?\d+ ' + r'(?(3)required )positional arguments?') + + repl = lambda: None + with tm.assertRaisesRegexp(TypeError, p_err): + values.str.replace('a', repl) + + repl = lambda m, x: None + with tm.assertRaisesRegexp(TypeError, p_err): + values.str.replace('a', repl) + + repl = lambda m, x, y=None: None + with tm.assertRaisesRegexp(TypeError, p_err): + values.str.replace('a', repl) + + # test regex named groups + values = Series(['Foo Bar Baz', NA]) + pat = r"(?P\w+) (?P\w+) (?P\w+)" + repl = lambda m: m.group('middle').swapcase() + result = values.str.replace(pat, repl) + exp = Series(['bAR', NA]) + tm.assert_series_equal(result, exp) + def test_repeat(self): values = Series(['a', 'b', NA, 'c', NA, 'd'])