From 261209fc57751adc71b36931fb0ada50a39ac806 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 26 Jul 2025 11:52:27 +0200 Subject: [PATCH 1/7] BUG: fix .str.isdigit to honor unicode superscript for older pyarrow --- doc/source/whatsnew/v2.3.2.rst | 3 ++- pandas/core/arrays/_arrow_string_mixins.py | 7 +++++++ pandas/tests/strings/test_strings.py | 7 ++++--- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.3.2.rst b/doc/source/whatsnew/v2.3.2.rst index faa61cf4bd3bc..88bd63d8942ea 100644 --- a/doc/source/whatsnew/v2.3.2.rst +++ b/doc/source/whatsnew/v2.3.2.rst @@ -22,7 +22,8 @@ become the default string dtype in pandas 3.0. See Bug fixes ^^^^^^^^^ -- +- Fix :meth:`~Series.str.isdigit` to correctly recognize unicode superscript + characters as digits for :class:`StringDtype` backed by PyArrow (:issue:`61466`) .. --------------------------------------------------------------------------- .. _whatsnew_232.contributors: diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 07cbf489cfe1c..ad91d60aae922 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -15,6 +15,7 @@ HAS_PYARROW, pa_version_under13p0, pa_version_under17p0, + pa_version_under21p0, ) if HAS_PYARROW: @@ -261,6 +262,12 @@ def _str_isdecimal(self): return self._convert_bool_result(result) def _str_isdigit(self): + if pa_version_under21p0: + # https://github.com/pandas-dev/pandas/issues/61466 + res_list = self._apply_elementwise(str.isdigit) + return self._convert_bool_result( + pa.chunked_array(res_list, type=pa.bool_()) + ) result = pc.utf8_is_digit(self._pa_array) return self._convert_bool_result(result) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 025f837982595..2ed00703212ca 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -240,8 +240,9 @@ def test_ismethods(method, expected, any_string_dtype): @pytest.mark.parametrize( "method, expected", [ - ("isnumeric", [False, True, True, False, True, True, False]), - ("isdecimal", [False, True, False, False, False, True, False]), + ("isnumeric", [False, True, True, True, False, True, True, False]), + ("isdecimal", [False, True, False, False, False, False, True, False]), + ("isdigit", [False, True, True, False, False, False, True, False]), ], ) def test_isnumeric_unicode(method, expected, any_string_dtype): @@ -250,7 +251,7 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY # 0xFF13: 3 Em 3 # noqa: RUF003 ser = Series( - ["A", "3", "¼", "★", "፸", "3", "four"], # noqa: RUF001 + ["A", "3", "³", "¼", "★", "፸", "3", "four"], # noqa: RUF001 dtype=any_string_dtype, ) expected_dtype = ( From cf26a930329248a8856d105bfa1291030d85711d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 13 Aug 2025 22:09:25 +0200 Subject: [PATCH 2/7] update test --- pandas/tests/strings/test_strings.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 6c751be0e31b5..932dc187932b0 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -6,12 +6,15 @@ import numpy as np import pytest +from pandas.compat import pa_version_under21p0 + from pandas import ( NA, DataFrame, Index, MultiIndex, Series, + StringDtype, option_context, ) import pandas._testing as tm @@ -264,6 +267,16 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" ) expected = Series(expected, dtype=expected_dtype) + if ( + method == "isdigit" + and isinstance(ser.dtype, StringDtype) + and ser.dtype.storage == "pyarrow" + and not pa_version_under21p0 + ): + # known difference in behavior between python and pyarrow unicode handling + # pyarrow 21+ considers ¼ as a digit, while python does not + expected.iloc[3] = True + result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) From 8349551d191f0d54aff612ba193021a24c811c44 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 19 Aug 2025 20:52:39 +0200 Subject: [PATCH 3/7] update test --- pandas/tests/strings/test_strings.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 932dc187932b0..24a15c86375a7 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -274,8 +274,9 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): and not pa_version_under21p0 ): # known difference in behavior between python and pyarrow unicode handling - # pyarrow 21+ considers ¼ as a digit, while python does not + # pyarrow 21+ considers ¼ and ፸ as a digit, while python does not expected.iloc[3] = True + expected.iloc[5] = True result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) From 7cd79a5ec48aeb5e2a59216a3ab5d4fadb8b54aa Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 20 Aug 2025 09:23:53 +0200 Subject: [PATCH 4/7] update test --- pandas/tests/strings/test_strings.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 24a15c86375a7..036c3cc2d132a 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -282,8 +282,13 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): tm.assert_series_equal(result, expected) # compare with standard library - expected = [getattr(item, method)() for item in ser] - assert list(result) == expected + # (only for non-pyarrow storage given the above differences) + if any_string_dtype == "object" or ( + isinstance(any_string_dtype, StringDtype) + and any_string_dtype.storage == "python" + ): + expected = [getattr(item, method)() for item in ser] + assert list(result) == expected @pytest.mark.parametrize( From 71223d32e7007d66f4abd60fc16b4bc7e77e0bd3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 24 Sep 2025 22:47:29 +0200 Subject: [PATCH 5/7] update docstring and add note about differences in behaviour --- pandas/core/strings/accessor.py | 13 +++++++++++-- pandas/tests/strings/test_strings.py | 6 ++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 21e6e2efbe778..45f5c3cb533a8 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3610,10 +3610,19 @@ def casefold(self): >>> s3 = pd.Series(['23', '³', '⅕', '']) >>> s3.str.isdigit() 0 True - 1 False - 2 False + 1 True + 2 True 3 False dtype: bool + + Notes + ----- + The exact behavior of this method, i.e. which unicode characters are + considered as digits, depends on the backend used for string operations, + and there can be small differences. + For example, Python considers the ³ superscript character as a digit, but + not the ⅕ fraction character, while PyArrow considers both as digits. For + simple (ascii) decimal numbers, the behaviour is consistent. """ _shared_docs["isspace"] = """ diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index fc0dd23334706..20dbb6068ae08 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -7,8 +7,10 @@ import numpy as np import pytest -from pandas.compat import pa_version_under21p0 -from pandas.errors import Pandas4Warning +from pandas.compat import ( + Pandas4Warning, + pa_version_under21p0, +) from pandas import ( NA, From c2318fb3ccf9a1515b19443129aaaebfc932cf8e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 24 Sep 2025 23:00:46 +0200 Subject: [PATCH 6/7] fixup merge --- pandas/tests/strings/test_strings.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 20dbb6068ae08..fc0dd23334706 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -7,10 +7,8 @@ import numpy as np import pytest -from pandas.compat import ( - Pandas4Warning, - pa_version_under21p0, -) +from pandas.compat import pa_version_under21p0 +from pandas.errors import Pandas4Warning from pandas import ( NA, From 8220a4a60ed107464ab4dd9c84cfaaa6631febf5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 25 Sep 2025 14:28:59 +0200 Subject: [PATCH 7/7] switch order of docstring sections --- pandas/core/strings/accessor.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 45f5c3cb533a8..b78ea3a9bf883 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3602,11 +3602,21 @@ def casefold(self): Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. - Examples - -------- + Notes + ----- Similar to ``str.isdecimal`` but also includes special digits, like superscripted and subscripted digits in unicode. + The exact behavior of this method, i.e. which unicode characters are + considered as digits, depends on the backend used for string operations, + and there can be small differences. + For example, Python considers the ³ superscript character as a digit, but + not the ⅕ fraction character, while PyArrow considers both as digits. For + simple (ascii) decimal numbers, the behaviour is consistent. + + Examples + -------- + >>> s3 = pd.Series(['23', '³', '⅕', '']) >>> s3.str.isdigit() 0 True @@ -3614,15 +3624,6 @@ def casefold(self): 2 True 3 False dtype: bool - - Notes - ----- - The exact behavior of this method, i.e. which unicode characters are - considered as digits, depends on the backend used for string operations, - and there can be small differences. - For example, Python considers the ³ superscript character as a digit, but - not the ⅕ fraction character, while PyArrow considers both as digits. For - simple (ascii) decimal numbers, the behaviour is consistent. """ _shared_docs["isspace"] = """