Skip to content

Commit c6f2c4f

Browse files
committed
DEPR: .str accessor with object dtype
1 parent cb7b334 commit c6f2c4f

File tree

19 files changed

+162
-70
lines changed

19 files changed

+162
-70
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -621,6 +621,7 @@ Other Deprecations
621621
- Deprecated lowercase strings ``w``, ``w-mon``, ``w-tue``, etc. denoting frequencies in :class:`Week` in favour of ``W``, ``W-MON``, ``W-TUE``, etc. (:issue:`58998`)
622622
- Deprecated parameter ``method`` in :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like` (:issue:`58667`)
623623
- Deprecated strings ``w``, ``d``, ``MIN``, ``MS``, ``US`` and ``NS`` denoting units in :class:`Timedelta` in favour of ``W``, ``D``, ``min``, ``ms``, ``us`` and ``ns`` (:issue:`59051`)
624+
- Deprecated the ``.str`` accessor on ``object`` dtype :class:`Series` when the ``future.infer_string`` option is enabled; explicitly cast to ``"str"`` dtype (e.g. ``ser.astype("str")``) before using the accessor instead (:issue:`29710`)
624625
- Deprecated the ``arg`` parameter of ``Series.map``; pass the added ``func`` argument instead. (:issue:`61260`)
625626
- Deprecated using ``epoch`` date format in :meth:`DataFrame.to_json` and :meth:`Series.to_json`, use ``iso`` instead. (:issue:`57063`)
626627

pandas/core/accessor.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,10 @@ def _dir_additions(self) -> set[str]:
4141
"""
4242
Add additional __dir__ for this object.
4343
"""
44-
return {accessor for accessor in self._accessors if hasattr(self, accessor)}
44+
with warnings.catch_warnings():
45+
# Probing accessors with hasattr can emit warnings (e.g. the .str accessor deprecation on object dtype); silence every warning while building the set
46+
warnings.filterwarnings("ignore")
47+
return {accessor for accessor in self._accessors if hasattr(self, accessor)}
4548

4649
def __dir__(self) -> list[str]:
4750
"""

pandas/core/strings/accessor.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
from pandas.core.dtypes.missing import isna
5050

5151
from pandas.core.arrays import ExtensionArray
52+
from pandas.core.arrays.string_ import StringDtype
5253
from pandas.core.base import NoNewAttributesMixin
5354
from pandas.core.construction import extract_array
5455

@@ -203,8 +204,6 @@ class StringMethods(NoNewAttributesMixin):
203204
# * extractall
204205

205206
def __init__(self, data) -> None:
206-
from pandas.core.arrays.string_ import StringDtype
207-
208207
self._inferred_dtype = self._validate(data)
209208
self._is_categorical = isinstance(data.dtype, CategoricalDtype)
210209
self._is_string = isinstance(data.dtype, StringDtype)
@@ -255,6 +254,14 @@ def _validate(data):
255254
data = extract_array(data)
256255

257256
values = getattr(data, "categories", data) # categorical / normal
257+
if data.dtype == object and get_option("future.infer_string"):
258+
warnings.warn(
259+
# GH#29710
260+
".str accessor on object dtype is deprecated. Explicitly cast "
261+
"to 'str' dtype instead.",
262+
FutureWarning,
263+
stacklevel=find_stack_level(),
264+
)
258265

259266
inferred_dtype = lib.infer_dtype(values, skipna=True)
260267

@@ -3875,7 +3882,6 @@ def _result_dtype(arr):
38753882
# workaround #27953
38763883
# ideally we just pass `dtype=arr.dtype` unconditionally, but this fails
38773884
# when the list of values is empty.
3878-
from pandas.core.arrays.string_ import StringDtype
38793885

38803886
if isinstance(arr.dtype, (ArrowDtype, StringDtype)):
38813887
return arr.dtype

pandas/io/pytables.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5278,11 +5278,16 @@ def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.nd
52785278
"""
52795279
# encode if needed
52805280
if len(data):
5281-
data = (
5282-
Series(data.ravel(), copy=False, dtype="object")
5283-
.str.encode(encoding, errors)
5284-
._values.reshape(data.shape)
5281+
# We can _almost_ do ser.astype("str").str.encode(encoding, errors)
5282+
# But the conversion to "str" can fail in e.g. test_to_hdf_errors
5283+
ser = Series(data.ravel(), copy=False, dtype="object")
5284+
arr = np.asarray(ser)
5285+
func = lambda x: x.encode(encoding, errors=errors)
5286+
mask = isna(arr)
5287+
result = lib.map_infer_mask(
5288+
arr, func, mask.view(np.uint8), convert=not np.all(mask)
52855289
)
5290+
data = result.reshape(data.shape)
52865291

52875292
# create the sized dtype
52885293
ensured = ensure_object(data.ravel())
@@ -5319,9 +5324,13 @@ def _unconvert_string_array(
53195324
dtype = f"U{itemsize}"
53205325

53215326
if isinstance(data[0], bytes):
5322-
ser = Series(data, copy=False).str.decode(
5323-
encoding, errors=errors, dtype="object"
5324-
)
5327+
with warnings.catch_warnings():
5328+
warnings.filterwarnings(
5329+
"ignore", ".str accessor on object dtype is deprecated"
5330+
)
5331+
ser = Series(data, copy=False).str.decode(
5332+
encoding, errors=errors, dtype="object"
5333+
)
53255334
data = ser.to_numpy()
53265335
data.flags.writeable = True
53275336
else:

pandas/io/sas/sas7bdat.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from datetime import datetime
2020
import sys
2121
from typing import TYPE_CHECKING
22+
import warnings
2223

2324
import numpy as np
2425

@@ -717,7 +718,11 @@ def _chunk_to_dataframe(self) -> DataFrame:
717718
elif self._column_types[j] == b"s":
718719
rslt[name] = pd.Series(self._string_chunk[js, :], index=ix, copy=False)
719720
if self.convert_text and (self.encoding is not None):
720-
rslt[name] = self._decode_string(rslt[name].str)
721+
with warnings.catch_warnings():
722+
warnings.filterwarnings(
723+
"ignore", ".str accessor on object dtype is deprecated"
724+
)
725+
rslt[name] = self._decode_string(rslt[name].str)
721726
if infer_string:
722727
rslt[name] = rslt[name].astype("str")
723728

pandas/io/stata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2744,7 +2744,7 @@ def _encode_strings(self) -> None:
27442744
types cannot be exported and must first be converted to one of the
27452745
supported types."""
27462746
)
2747-
encoded = self.data[col].str.encode(self._encoding)
2747+
encoded = self.data[col].astype("str").str.encode(self._encoding)
27482748
# If larger than _max_string_length do nothing
27492749
if (
27502750
max_len_string_array(ensure_object(self.data[col]._values))

pandas/tests/dtypes/test_inference.py

Lines changed: 56 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
Generic,
2626
TypeVar,
2727
)
28+
import warnings
2829

2930
import numpy as np
3031
import pytest
@@ -134,58 +135,61 @@ def shape(self):
134135

135136
# collect all objects to be tested for list-like-ness; use tuples of objects,
136137
# whether they are list-like or not (special casing for sets), and their ID
137-
ll_params = [
138-
([1], True, "list"),
139-
([], True, "list-empty"),
140-
((1,), True, "tuple"),
141-
((), True, "tuple-empty"),
142-
({"a": 1}, True, "dict"),
143-
({}, True, "dict-empty"),
144-
({"a", 1}, "set", "set"),
145-
(set(), "set", "set-empty"),
146-
(frozenset({"a", 1}), "set", "frozenset"),
147-
(frozenset(), "set", "frozenset-empty"),
148-
(iter([1, 2]), True, "iterator"),
149-
(iter([]), True, "iterator-empty"),
150-
((x for x in [1, 2]), True, "generator"),
151-
((_ for _ in []), True, "generator-empty"),
152-
(Series([1]), True, "Series"),
153-
(Series([], dtype=object), True, "Series-empty"),
154-
# Series.str will still raise a TypeError if iterated
155-
(Series(["a"]).str, True, "StringMethods"),
156-
(Series([], dtype="O").str, True, "StringMethods-empty"),
157-
(Index([1]), True, "Index"),
158-
(Index([]), True, "Index-empty"),
159-
(DataFrame([[1]]), True, "DataFrame"),
160-
(DataFrame(), True, "DataFrame-empty"),
161-
(np.ndarray((2,) * 1), True, "ndarray-1d"),
162-
(np.array([]), True, "ndarray-1d-empty"),
163-
(np.ndarray((2,) * 2), True, "ndarray-2d"),
164-
(np.array([[]]), True, "ndarray-2d-empty"),
165-
(np.ndarray((2,) * 3), True, "ndarray-3d"),
166-
(np.array([[[]]]), True, "ndarray-3d-empty"),
167-
(np.ndarray((2,) * 4), True, "ndarray-4d"),
168-
(np.array([[[[]]]]), True, "ndarray-4d-empty"),
169-
(np.array(2), False, "ndarray-0d"),
170-
(MockNumpyLikeArray(np.ndarray((2,) * 1)), True, "duck-ndarray-1d"),
171-
(MockNumpyLikeArray(np.array([])), True, "duck-ndarray-1d-empty"),
172-
(MockNumpyLikeArray(np.ndarray((2,) * 2)), True, "duck-ndarray-2d"),
173-
(MockNumpyLikeArray(np.array([[]])), True, "duck-ndarray-2d-empty"),
174-
(MockNumpyLikeArray(np.ndarray((2,) * 3)), True, "duck-ndarray-3d"),
175-
(MockNumpyLikeArray(np.array([[[]]])), True, "duck-ndarray-3d-empty"),
176-
(MockNumpyLikeArray(np.ndarray((2,) * 4)), True, "duck-ndarray-4d"),
177-
(MockNumpyLikeArray(np.array([[[[]]]])), True, "duck-ndarray-4d-empty"),
178-
(MockNumpyLikeArray(np.array(2)), False, "duck-ndarray-0d"),
179-
(1, False, "int"),
180-
(b"123", False, "bytes"),
181-
(b"", False, "bytes-empty"),
182-
("123", False, "string"),
183-
("", False, "string-empty"),
184-
(str, False, "string-type"),
185-
(object(), False, "object"),
186-
(np.nan, False, "NaN"),
187-
(None, False, "None"),
188-
]
138+
with warnings.catch_warnings():
139+
# suppress warning on "StringMethods-empty" with object dtype
140+
warnings.filterwarnings("ignore", ".str accessor on object dtype is deprecated")
141+
ll_params = [
142+
([1], True, "list"),
143+
([], True, "list-empty"),
144+
((1,), True, "tuple"),
145+
((), True, "tuple-empty"),
146+
({"a": 1}, True, "dict"),
147+
({}, True, "dict-empty"),
148+
({"a", 1}, "set", "set"),
149+
(set(), "set", "set-empty"),
150+
(frozenset({"a", 1}), "set", "frozenset"),
151+
(frozenset(), "set", "frozenset-empty"),
152+
(iter([1, 2]), True, "iterator"),
153+
(iter([]), True, "iterator-empty"),
154+
((x for x in [1, 2]), True, "generator"),
155+
((_ for _ in []), True, "generator-empty"),
156+
(Series([1]), True, "Series"),
157+
(Series([], dtype=object), True, "Series-empty"),
158+
# Series.str will still raise a TypeError if iterated
159+
(Series(["a"]).str, True, "StringMethods"),
160+
(Series([], dtype="O").str, True, "StringMethods-empty"),
161+
(Index([1]), True, "Index"),
162+
(Index([]), True, "Index-empty"),
163+
(DataFrame([[1]]), True, "DataFrame"),
164+
(DataFrame(), True, "DataFrame-empty"),
165+
(np.ndarray((2,) * 1), True, "ndarray-1d"),
166+
(np.array([]), True, "ndarray-1d-empty"),
167+
(np.ndarray((2,) * 2), True, "ndarray-2d"),
168+
(np.array([[]]), True, "ndarray-2d-empty"),
169+
(np.ndarray((2,) * 3), True, "ndarray-3d"),
170+
(np.array([[[]]]), True, "ndarray-3d-empty"),
171+
(np.ndarray((2,) * 4), True, "ndarray-4d"),
172+
(np.array([[[[]]]]), True, "ndarray-4d-empty"),
173+
(np.array(2), False, "ndarray-0d"),
174+
(MockNumpyLikeArray(np.ndarray((2,) * 1)), True, "duck-ndarray-1d"),
175+
(MockNumpyLikeArray(np.array([])), True, "duck-ndarray-1d-empty"),
176+
(MockNumpyLikeArray(np.ndarray((2,) * 2)), True, "duck-ndarray-2d"),
177+
(MockNumpyLikeArray(np.array([[]])), True, "duck-ndarray-2d-empty"),
178+
(MockNumpyLikeArray(np.ndarray((2,) * 3)), True, "duck-ndarray-3d"),
179+
(MockNumpyLikeArray(np.array([[[]]])), True, "duck-ndarray-3d-empty"),
180+
(MockNumpyLikeArray(np.ndarray((2,) * 4)), True, "duck-ndarray-4d"),
181+
(MockNumpyLikeArray(np.array([[[[]]]])), True, "duck-ndarray-4d-empty"),
182+
(MockNumpyLikeArray(np.array(2)), False, "duck-ndarray-0d"),
183+
(1, False, "int"),
184+
(b"123", False, "bytes"),
185+
(b"", False, "bytes-empty"),
186+
("123", False, "string"),
187+
("", False, "string-empty"),
188+
(str, False, "string-type"),
189+
(object(), False, "object"),
190+
(np.nan, False, "NaN"),
191+
(None, False, "None"),
192+
]
189193
objs, expected, ids = zip(*ll_params)
190194

191195

pandas/tests/groupby/methods/test_value_counts.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -347,8 +347,12 @@ def test_against_frame_and_seriesgroupby(
347347
expected.name = name
348348
if as_index:
349349
index_frame = expected.index.to_frame(index=False)
350-
index_frame["gender"] = index_frame["both"].str.split("-").str.get(0)
351-
index_frame["education"] = index_frame["both"].str.split("-").str.get(1)
350+
index_frame["gender"] = (
351+
index_frame["both"].astype(str).str.split("-").map(lambda x: x[0])
352+
)
353+
index_frame["education"] = (
354+
index_frame["both"].astype(str).str.split("-").map(lambda x: x[1])
355+
)
352356
del index_frame["both"]
353357
index_frame2 = index_frame.rename({0: None}, axis=1)
354358
expected.index = MultiIndex.from_frame(index_frame2)
@@ -360,8 +364,16 @@ def test_against_frame_and_seriesgroupby(
360364
expected.index.names = [None] + expected.index.names[1:]
361365
tm.assert_series_equal(result, expected)
362366
else:
363-
expected.insert(1, "gender", expected["both"].str.split("-").str.get(0))
364-
expected.insert(2, "education", expected["both"].str.split("-").str.get(1))
367+
expected.insert(
368+
1,
369+
"gender",
370+
expected["both"].astype(str).str.split("-").map(lambda x: x[0]),
371+
)
372+
expected.insert(
373+
2,
374+
"education",
375+
expected["both"].astype(str).str.split("-").map(lambda x: x[1]),
376+
)
365377
if using_infer_string:
366378
expected = expected.astype({"gender": "str", "education": "str"})
367379
del expected["both"]

pandas/tests/io/sas/test_sas7bdat.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,9 @@ def test_iterator_read_too_much(self, dirpath):
109109
tm.assert_frame_equal(d1, d2)
110110

111111

112+
@pytest.mark.filterwarnings(
113+
"ignore:.str accessor on object dtype is deprecated:FutureWarning"
114+
)
112115
def test_encoding_options(datapath):
113116
fname = datapath("io", "sas", "data", "test1.sas7bdat")
114117
df1 = pd.read_sas(fname)

pandas/tests/series/test_api.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ def test_attrs(self):
160160
result = s + 1
161161
assert result.attrs == {"version": 1}
162162

163+
@pytest.mark.filterwarnings("ignore:.str accessor:FutureWarning")
163164
def test_inspect_getmembers(self):
164165
# GH38782
165166
ser = Series(dtype=object)

0 commit comments

Comments
 (0)