Skip to content

Commit 6690762

Browse files
Backport PR #62283 on branch 2.3.x (BUG: fix pyarrow string regex replacement) (#62328)
Co-authored-by: Álvaro Kothe <[email protected]>
1 parent fd158d6 commit 6690762

File tree

4 files changed

+86
-3
lines changed

4 files changed

+86
-3
lines changed

doc/source/whatsnew/v2.3.3.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ become the default string dtype in pandas 3.0. See
2222

2323
Bug fixes
2424
^^^^^^^^^
25+
- Fix bug in :meth:`Series.str.replace` where using named capture groups (e.g., ``\g<name>``) with the Arrow-backed dtype would raise an error (:issue:`57636`)
2526
- Fix regression in :meth:`Series.str.contains`, :meth:`Series.str.match` and :meth:`Series.str.fullmatch`
2627
with a compiled regex and custom flags (:issue:`62240`)
2728

pandas/core/arrays/_arrow_string_mixins.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,10 +168,20 @@ def _str_replace(
168168
flags: int = 0,
169169
regex: bool = True,
170170
) -> Self:
171-
if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
171+
if (
172+
isinstance(pat, re.Pattern)
173+
or callable(repl)
174+
or not case
175+
or flags
176+
or (
177+
isinstance(repl, str)
178+
and (r"\g<" in repl or re.search(r"\\\d", repl) is not None)
179+
)
180+
):
172181
raise NotImplementedError(
173182
"replace is not supported with a re.Pattern, callable repl, "
174-
"case=False, or flags!=0"
183+
"case=False, flags!=0, or when the replacement string contains "
184+
"named group references (\\g<...>, \\d+)"
175185
)
176186

177187
func = pc.replace_substring_regex if regex else pc.replace_substring

pandas/core/arrays/string_arrow.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,17 @@ def _str_replace(
419419
flags: int = 0,
420420
regex: bool = True,
421421
):
422-
if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
422+
if (
423+
isinstance(pat, re.Pattern)
424+
or callable(repl)
425+
or not case
426+
or flags
427+
or ( # substitution contains a named group pattern
428+
# https://docs.python.org/3/library/re.html
429+
isinstance(repl, str)
430+
and (r"\g<" in repl or re.search(r"\\\d", repl) is not None)
431+
)
432+
):
423433
return super()._str_replace(pat, repl, n, case, flags, regex)
424434

425435
return ArrowStringArrayMixin._str_replace(

pandas/tests/strings/test_find_replace.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -594,6 +594,68 @@ def test_replace_callable_raises(any_string_dtype, repl):
594594
values.str.replace("a", repl, regex=True)
595595

596596

597+
@pytest.mark.parametrize(
598+
"repl, expected_list",
599+
[
600+
(
601+
r"\g<three> \g<two> \g<one>",
602+
["Three Two One", "Baz Bar Foo"],
603+
),
604+
(
605+
r"\g<3> \g<2> \g<1>",
606+
["Three Two One", "Baz Bar Foo"],
607+
),
608+
(
609+
r"\g<2>0",
610+
["Two0", "Bar0"],
611+
),
612+
(
613+
r"\g<2>0 \1",
614+
["Two0 One", "Bar0 Foo"],
615+
),
616+
],
617+
ids=[
618+
"named_groups_full_swap",
619+
"numbered_groups_full_swap",
620+
"single_group_with_literal",
621+
"mixed_group_reference_with_literal",
622+
],
623+
)
624+
@pytest.mark.parametrize("use_compile", [True, False])
625+
def test_replace_named_groups_regex_swap(
626+
any_string_dtype, use_compile, repl, expected_list
627+
):
628+
# GH#57636
629+
ser = Series(["One Two Three", "Foo Bar Baz"], dtype=any_string_dtype)
630+
pattern = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
631+
if use_compile:
632+
pattern = re.compile(pattern)
633+
result = ser.str.replace(pattern, repl, regex=True)
634+
expected = Series(expected_list, dtype=any_string_dtype)
635+
tm.assert_series_equal(result, expected)
636+
637+
638+
@pytest.mark.parametrize(
639+
"repl",
640+
[
641+
r"\g<20>",
642+
r"\20",
643+
],
644+
)
645+
@pytest.mark.parametrize("use_compile", [True, False])
646+
def test_replace_named_groups_regex_swap_expected_fail(
647+
any_string_dtype, repl, use_compile
648+
):
649+
# GH#57636
650+
pattern = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
651+
if use_compile:
652+
pattern = re.compile(pattern)
653+
ser = Series(["One Two Three", "Foo Bar Baz"], dtype=any_string_dtype)
654+
655+
with pytest.raises(re.error, match="invalid group reference"):
656+
ser.str.replace(pattern, repl, regex=True)
657+
658+
597659
def test_replace_callable_named_groups(any_string_dtype):
598660
# test regex named groups
599661
ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype)

0 commit comments

Comments
 (0)