Skip to content

Commit 8eb75e2

Browse files
committed
fix(arrow): fix str.replace behaviour for named group
1 parent 7f670c1 commit 8eb75e2

File tree

3 files changed

+85
-3
lines changed

3 files changed

+85
-3
lines changed

pandas/core/arrays/_arrow_string_mixins.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -174,10 +174,20 @@ def _str_replace(
174174
flags: int = 0,
175175
regex: bool = True,
176176
) -> Self:
177-
if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
177+
if (
178+
isinstance(pat, re.Pattern)
179+
or callable(repl)
180+
or not case
181+
or flags
182+
or (
183+
isinstance(repl, str)
184+
and (r"\g<" in repl or re.search(r"\\\d", repl) is not None)
185+
)
186+
):
178187
raise NotImplementedError(
179188
"replace is not supported with a re.Pattern, callable repl, "
180-
"case=False, or flags!=0"
189+
"case=False, flags!=0, or when the replacement string contains "
190+
"named group references (\\g<...>, \\d+)"
181191
)
182192

183193
func = pc.replace_substring_regex if regex else pc.replace_substring

pandas/core/arrays/string_arrow.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -423,7 +423,17 @@ def _str_replace(
423423
flags: int = 0,
424424
regex: bool = True,
425425
):
426-
if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
426+
if (
427+
isinstance(pat, re.Pattern)
428+
or callable(repl)
429+
or not case
430+
or flags
431+
or ( # substitution contains a named group pattern
432+
# https://docs.python.org/3/library/re.html
433+
isinstance(repl, str)
434+
and (r"\g<" in repl or re.search(r"\\\d", repl) is not None)
435+
)
436+
):
427437
return super()._str_replace(pat, repl, n, case, flags, regex)
428438

429439
return ArrowStringArrayMixin._str_replace(

pandas/tests/strings/test_find_replace.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -592,6 +592,68 @@ def test_replace_callable_raises(any_string_dtype, repl):
592592
values.str.replace("a", repl, regex=True)
593593

594594

595+
@pytest.mark.parametrize(
    "repl, expected_list",
    [
        pytest.param(
            r"\g<three> \g<two> \g<one>",
            ["Three Two One", "Baz Bar Foo"],
            id="named_groups_full_swap",
        ),
        pytest.param(
            r"\g<3> \g<2> \g<1>",
            ["Three Two One", "Baz Bar Foo"],
            id="numbered_groups_full_swap",
        ),
        pytest.param(
            r"\g<2>0",
            ["Two0", "Bar0"],
            id="single_group_with_literal",
        ),
        pytest.param(
            r"\g<2>0 \1",
            ["Two0 One", "Bar0 Foo"],
            id="mixed_group_reference_with_literal",
        ),
    ],
)
@pytest.mark.parametrize("use_compile", [True, False])
def test_replace_named_groups_regex_swap(
    any_string_dtype, use_compile, repl, expected_list
):
    """
    Check that group references in ``repl`` are expanded by ``str.replace``.

    The pattern captures three whitespace-separated words as the named groups
    ``one``, ``two`` and ``three``. Each case supplies a replacement template
    that refers to those groups by name or by number, and the whole string is
    expected to be rewritten accordingly. The pattern is passed either as a
    plain string or as a compiled ``re.Pattern`` depending on ``use_compile``.
    """
    # GH#57636
    raw_pattern = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
    pat = re.compile(raw_pattern) if use_compile else raw_pattern

    data = Series(["One Two Three", "Foo Bar Baz"], dtype=any_string_dtype)
    swapped = data.str.replace(pat, repl, regex=True)

    tm.assert_series_equal(swapped, Series(expected_list, dtype=any_string_dtype))
634+
635+
636+
@pytest.mark.parametrize("repl", [r"\g<20>", r"\20"])
@pytest.mark.parametrize("use_compile", [True, False])
def test_replace_named_groups_regex_swap_expected_fail(
    any_string_dtype, repl, use_compile
):
    """
    Check that a reference to a non-existent group raises ``re.error``.

    The pattern defines only three groups, while each ``repl`` case refers to
    group 20. The call is expected to fail with an "invalid group reference"
    error whether the pattern is given as a string or as a compiled
    ``re.Pattern``.
    """
    # GH#57636
    raw_pattern = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
    pat = re.compile(raw_pattern) if use_compile else raw_pattern
    data = Series(["One Two Three", "Foo Bar Baz"], dtype=any_string_dtype)

    with pytest.raises(re.error, match="invalid group reference"):
        data.str.replace(pat, repl, regex=True)
655+
656+
595657
def test_replace_callable_named_groups(any_string_dtype):
596658
# test regex named groups
597659
ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype)

0 commit comments

Comments
 (0)