From 519ea798658ff0aca8877434affbc0414ce81936 Mon Sep 17 00:00:00 2001 From: ptth222 Date: Thu, 6 Mar 2025 19:20:41 -0500 Subject: [PATCH 01/11] Update _arrow_string_mixins.py Address #61072. --- pandas/core/arrays/_arrow_string_mixins.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 1ca52ce64bd77..145d848a9e38c 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -321,8 +321,12 @@ def _str_fullmatch( flags: int = 0, na: Scalar | lib.NoDefault = lib.no_default, ): - if not pat.endswith("$") or pat.endswith("\\$"): - pat = f"{pat}$" + if not pat.endswith("$") and not pat.startswith("^"): + pat = f"^({pat})$" + elif not pat.endswith("$"): + pat = f"^({pat[1:]})$" + elif not pat.startswith("^"): + pat = f"^({pat[0:-1]})$" return self._str_match(pat, case, flags, na) def _str_find(self, sub: str, start: int = 0, end: int | None = None): From 93ad579ee5e54626b412196247c05bbddcb44959 Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Mon, 15 Sep 2025 13:35:40 -0400 Subject: [PATCH 02/11] Updated _arrow_string_mixins.py Made the changes suggested by @jorisvandenbossche. 
--- pandas/core/arrays/_arrow_string_mixins.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 145d848a9e38c..d62bf7c58f66d 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -321,9 +321,9 @@ def _str_fullmatch( flags: int = 0, na: Scalar | lib.NoDefault = lib.no_default, ): - if not pat.endswith("$") and not pat.startswith("^"): + if (not pat.endswith("$") or pat.endswith("\\$")) and not pat.startswith("^"): pat = f"^({pat})$" - elif not pat.endswith("$"): + elif not pat.endswith("$") or pat.endswith("\\$"): pat = f"^({pat[1:]})$" elif not pat.startswith("^"): pat = f"^({pat[0:-1]})$" From f99bcd7850d46df67c374e788f1fca95fd40e258 Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Tue, 16 Sep 2025 17:39:36 -0400 Subject: [PATCH 03/11] Update test_arrow.py Added a test to confirm that the arrow implementation gives the same result as the python one. Also changed test_str_fullmatch to use the fullmatch method instead of the match method. 
--- pandas/tests/extension/test_arrow.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index fbd3868f62899..01d9dbd38d2b9 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1880,10 +1880,17 @@ def test_str_match(pat, case, na, exp): ) def test_str_fullmatch(pat, case, na, exp): ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string())) - result = ser.str.match(pat, case=case, na=na) + result = ser.str.fullmatch(pat, case=case, na=na) expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_())) tm.assert_series_equal(result, expected) +def test_str_fullmatch_against_python_fullmatch(pat, case, na): + ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string())) + ser2 = pd.Series(["abc", "abc$", "$abc", None], dtype=str) + result = ser.str.fullmatch(pat, case=case, na=na) + result2 = ser2.str.fullmatch(pat, case=case, na=na) + tm.assert_series_equal(result, result2) + @pytest.mark.parametrize( "sub, start, end, exp, exp_type", From 46cb44024d0d9e7080498042b5ac05b64794f2e0 Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Tue, 16 Sep 2025 18:49:27 -0400 Subject: [PATCH 04/11] Update test_arrow.py There were errors, so I changed the tests to try and make them pass. 
--- pandas/tests/extension/test_arrow.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 01d9dbd38d2b9..813d4b469ca50 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1868,8 +1868,8 @@ def test_str_match(pat, case, na, exp): ["abc", False, None, [True, True, False, None]], ["Abc", True, None, [False, False, False, None]], ["bc", True, None, [False, False, False, None]], - ["ab", False, None, [True, True, False, None]], - ["a[a-z]{2}", False, None, [True, True, False, None]], + ["ab", False, None, [False, False, False, None]], + ["a[a-z]{2}", False, None, [True, False, False, None]], ["A[a-z]{1}", True, None, [False, False, False, None]], # GH Issue: #56652 ["abc$", False, None, [True, False, False, None]], @@ -1884,7 +1884,7 @@ def test_str_fullmatch(pat, case, na, exp): expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_())) tm.assert_series_equal(result, expected) -def test_str_fullmatch_against_python_fullmatch(pat, case, na): +def test_str_fullmatch_against_python_fullmatch(pat, case, na, exp): ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string())) ser2 = pd.Series(["abc", "abc$", "$abc", None], dtype=str) result = ser.str.fullmatch(pat, case=case, na=na) From cd1ebce7fcf23cd493de8836998ae830755b18a0 Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Tue, 16 Sep 2025 19:50:16 -0400 Subject: [PATCH 05/11] Update test_arrow.py Had to change expected result again because I missed one previously. Also had to change test structure to reuse parametrize. 
--- pandas/tests/extension/test_arrow.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 813d4b469ca50..dbb1136105d3f 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1865,7 +1865,7 @@ def test_str_match(pat, case, na, exp): @pytest.mark.parametrize( "pat, case, na, exp", [ - ["abc", False, None, [True, True, False, None]], + ["abc", False, None, [True, False, False, None]], ["Abc", True, None, [False, False, False, None]], ["bc", True, None, [False, False, False, None]], ["ab", False, None, [False, False, False, None]], @@ -1878,18 +1878,19 @@ def test_str_match(pat, case, na, exp): ["Abc\\$", True, None, [False, False, False, None]], ], ) -def test_str_fullmatch(pat, case, na, exp): - ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string())) - result = ser.str.fullmatch(pat, case=case, na=na) - expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_())) - tm.assert_series_equal(result, expected) - -def test_str_fullmatch_against_python_fullmatch(pat, case, na, exp): - ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string())) - ser2 = pd.Series(["abc", "abc$", "$abc", None], dtype=str) - result = ser.str.fullmatch(pat, case=case, na=na) - result2 = ser2.str.fullmatch(pat, case=case, na=na) - tm.assert_series_equal(result, result2) +class TestFullmatch: + def test_str_fullmatch(self, pat, case, na, exp): + ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string())) + result = ser.str.fullmatch(pat, case=case, na=na) + expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_())) + tm.assert_series_equal(result, expected) + + def test_str_fullmatch_against_python_fullmatch(self, pat, case, na, exp): + ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string())) + ser2 = pd.Series(["abc", "abc$", "$abc", None], dtype=str) + 
result = ser.str.fullmatch(pat, case=case, na=na) + result2 = ser2.str.fullmatch(pat, case=case, na=na) + tm.assert_series_equal(result, result2) @pytest.mark.parametrize( From c8d60483d16dee4eae0d6307708b260d1f1f0965 Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Tue, 16 Sep 2025 20:39:58 -0400 Subject: [PATCH 06/11] Update test_arrow.py There were errors about the series types not being the same, so I tried to address that. --- pandas/tests/extension/test_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index dbb1136105d3f..a6be86f91de67 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1889,7 +1889,7 @@ def test_str_fullmatch_against_python_fullmatch(self, pat, case, na, exp): ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string())) ser2 = pd.Series(["abc", "abc$", "$abc", None], dtype=str) result = ser.str.fullmatch(pat, case=case, na=na) - result2 = ser2.str.fullmatch(pat, case=case, na=na) + result2 = ser2.str.fullmatch(pat, case=case, na=na).astype(ArrowDtype(pa.bool_())) tm.assert_series_equal(result, result2) From 0796834db9ec6339fb44b56c60076a671ecacad0 Mon Sep 17 00:00:00 2001 From: Travis Thompson Date: Tue, 16 Sep 2025 22:34:38 -0400 Subject: [PATCH 07/11] Update test_arrow.py Trying again to get the result types correct so that the equal assertion works. 
--- pandas/tests/extension/test_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index a6be86f91de67..db17d9b8096b9 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1889,7 +1889,7 @@ def test_str_fullmatch_against_python_fullmatch(self, pat, case, na, exp): ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string())) ser2 = pd.Series(["abc", "abc$", "$abc", None], dtype=str) result = ser.str.fullmatch(pat, case=case, na=na) - result2 = ser2.str.fullmatch(pat, case=case, na=na).astype(ArrowDtype(pa.bool_())) + result2 = ser2.str.fullmatch(pat, case=case, na=na).astype(result.dtype) tm.assert_series_equal(result, result2) From 70e82b10b26a27c32a524c92c8d66db78e564935 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 21 Sep 2025 14:24:32 +0200 Subject: [PATCH 08/11] remove missing value in extra test --- pandas/tests/extension/test_arrow.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 1cc46caf41228..809db3c1ad216 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1890,10 +1890,10 @@ def test_str_fullmatch(self, pat, case, na, exp): result = ser.str.fullmatch(pat, case=case, na=na) expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_())) tm.assert_series_equal(result, expected) - + def test_str_fullmatch_against_python_fullmatch(self, pat, case, na, exp): - ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string())) - ser2 = pd.Series(["abc", "abc$", "$abc", None], dtype=str) + ser = pd.Series(["abc", "abc$", "$abc"], dtype=ArrowDtype(pa.string())) + ser2 = pd.Series(["abc", "abc$", "$abc"], dtype=str) result = ser.str.fullmatch(pat, case=case, na=na) result2 = ser2.str.fullmatch(pat, case=case, na=na).astype(result.dtype) 
tm.assert_series_equal(result, result2) From c424093911663d17b64d5d2d1791f438161cc2a2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 21 Sep 2025 14:38:37 +0200 Subject: [PATCH 09/11] move test comparing with python to test_find_replace.py --- pandas/tests/extension/test_arrow.py | 20 +++++-------- pandas/tests/strings/test_find_replace.py | 35 +++++++++++++++++++++++ 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 809db3c1ad216..7f226c7522237 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1870,6 +1870,8 @@ def test_str_match(pat, case, na, exp): @pytest.mark.parametrize( "pat, case, na, exp", + # Note: keep cases in sync with + # pandas/tests/strings/test_find_replace.py::test_str_fullmatch_extra_cases [ ["abc", False, None, [True, False, False, None]], ["Abc", True, None, [False, False, False, None]], @@ -1884,19 +1886,11 @@ def test_str_match(pat, case, na, exp): ["Abc\\$", True, None, [False, False, False, None]], ], ) -class TestFullmatch: - def test_str_fullmatch(self, pat, case, na, exp): - ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string())) - result = ser.str.fullmatch(pat, case=case, na=na) - expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_())) - tm.assert_series_equal(result, expected) - - def test_str_fullmatch_against_python_fullmatch(self, pat, case, na, exp): - ser = pd.Series(["abc", "abc$", "$abc"], dtype=ArrowDtype(pa.string())) - ser2 = pd.Series(["abc", "abc$", "$abc"], dtype=str) - result = ser.str.fullmatch(pat, case=case, na=na) - result2 = ser2.str.fullmatch(pat, case=case, na=na).astype(result.dtype) - tm.assert_series_equal(result, result2) +def test_str_fullmatch(pat, case, na, exp): + ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string())) + result = ser.str.fullmatch(pat, case=case, na=na) + expected = pd.Series(exp, 
dtype=ArrowDtype(pa.bool_())) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 027db8f5e9ec0..26616a969f407 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -1075,6 +1075,41 @@ def test_fullmatch_compiled_regex(any_string_dtype): values.str.fullmatch(re.compile("ab"), flags=re.IGNORECASE) +@pytest.mark.parametrize( + "pat, case, na, exp", + # Note: keep cases in sync with + # pandas/tests/extension/test_arrow.py::test_str_fullmatch + [ + ["abc", False, None, [True, False, False, None]], + ["Abc", True, None, [False, False, False, None]], + ["bc", True, None, [False, False, False, None]], + ["ab", False, None, [False, False, False, None]], + ["a[a-z]{2}", False, None, [True, False, False, None]], + ["A[a-z]{1}", True, None, [False, False, False, None]], + # GH Issue: #56652 + ["abc$", False, None, [True, False, False, None]], + ["abc\\$", False, None, [False, True, False, None]], + ["Abc$", True, None, [False, False, False, None]], + ["Abc\\$", True, None, [False, False, False, None]], + ], +) +def test_str_fullmatch_extra_cases(any_string_dtype, pat, case, na, exp): + ser = Series(["abc", "abc$", "$abc", None], dtype=any_string_dtype) + result = ser.str.fullmatch(pat, case=case, na=na) + + if any_string_dtype == "str": + # NaN propagates as False + exp[-1] = False + expected_dtype = bool + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series([True, False, np.nan, False], dtype=expected_dtype) + expected = Series(exp, dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + # -------------------------------------------------------------------------------------- # str.findall # -------------------------------------------------------------------------------------- From 
cb6a9d2cab304d150a05ddae1f8ccbcd5241a563 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 21 Sep 2025 15:08:22 +0200 Subject: [PATCH 10/11] add test case with optional groups --- pandas/tests/extension/test_arrow.py | 3 +++ pandas/tests/strings/test_find_replace.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 7f226c7522237..36dd91195d241 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1884,6 +1884,9 @@ def test_str_match(pat, case, na, exp): ["abc\\$", False, None, [False, True, False, None]], ["Abc$", True, None, [False, False, False, None]], ["Abc\\$", True, None, [False, False, False, None]], + # https://github.com/pandas-dev/pandas/issues/61072 + ["(abc)|(abx)", True, None, [True, False, False, None]], + ["((abc)|(abx))", True, None, [True, False, False, None]], ], ) def test_str_fullmatch(pat, case, na, exp): diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 26616a969f407..ec9ddc916a856 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -1091,6 +1091,9 @@ def test_fullmatch_compiled_regex(any_string_dtype): ["abc\\$", False, None, [False, True, False, None]], ["Abc$", True, None, [False, False, False, None]], ["Abc\\$", True, None, [False, False, False, None]], + # https://github.com/pandas-dev/pandas/issues/61072 + ["(abc)|(abx)", True, None, [True, False, False, None]], + ["((abc)|(abx))", True, None, [True, False, False, None]], ], ) def test_str_fullmatch_extra_cases(any_string_dtype, pat, case, na, exp): From 6fd088de5e62d5e46c589a6d16ec34018d11758d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 21 Sep 2025 15:10:33 +0200 Subject: [PATCH 11/11] add whatsnew --- doc/source/whatsnew/v2.3.3.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.3.3.rst 
b/doc/source/whatsnew/v2.3.3.rst index aaed7544d9975..1184835ff3a1a 100644 --- a/doc/source/whatsnew/v2.3.3.rst +++ b/doc/source/whatsnew/v2.3.3.rst @@ -25,6 +25,7 @@ Bug fixes - Fix bug in :meth:`Series.str.replace` using named capture groups (e.g., ``\g``) with the Arrow-backed dtype would raise an error (:issue:`57636`) - Fix regression in ``~Series.str.contains``, ``~Series.str.match`` and ``~Series.str.fullmatch`` with a compiled regex and custom flags (:issue:`62240`) +- Fix :meth:`Series.str.fullmatch` not matching patterns with groups correctly for the Arrow-backed string dtype (:issue:`61072`) .. --------------------------------------------------------------------------- .. _whatsnew_233.contributors: