@@ -62,7 +62,116 @@ def to_numeric(
6262):
6363 """
6464 Convert argument to a numeric type.
65- ...
65+
66+ The default return dtype is `float64` or `int64`
67+ depending on the data supplied. Use the `downcast` parameter
68+ to obtain other dtypes.
69+
70+ Please note that precision loss may occur if really large numbers
71+ are passed in. Due to the internal limitations of `ndarray`, if
72+ numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
73+ or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
74+ passed in, it is very likely they will be converted to float so that
75+ they can be stored in an `ndarray`. These warnings apply similarly to
76+ `Series` since it internally leverages `ndarray`.
77+
78+ Parameters
79+ ----------
80+ arg : scalar, list, tuple, 1-d array, or Series
81+ Argument to be converted.
82+
83+ errors : {'raise', 'coerce'}, default 'raise'
84+ - If 'raise', then invalid parsing will raise an exception.
85+ - If 'coerce', then invalid parsing will be set as NaN.
86+
87+ downcast : str, default None
88+ Can be 'integer', 'signed', 'unsigned', or 'float'.
89+ If not None, and if the data has been successfully cast to a
90+ numerical dtype (or if the data was numeric to begin with),
91+ downcast that resulting data to the smallest numerical dtype
92+ possible according to the following rules:
93+
94+ - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
95+ - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
96+ - 'float': smallest float dtype (min.: np.float32)
97+
98+ As this behaviour is separate from the core conversion to
99+ numeric values, any errors raised during the downcasting
100+ will be surfaced regardless of the value of the 'errors' input.
101+
102+ In addition, downcasting will only occur if the size
103+ of the resulting data's dtype is strictly larger than
104+ the dtype it is to be cast to, so if none of the dtypes
105+ checked satisfy that specification, no downcasting will be
106+ performed on the data.
107+
108+ dtype_backend : {'numpy_nullable', 'pyarrow'}
109+ Back-end data type applied to the returned object
110+ (still experimental). If not specified, the default behavior
111+ is to not use nullable data types. If specified, the behavior
112+ is as follows:
113+
114+ * ``"numpy_nullable"``: returns a nullable-dtype-backed object
115+ * ``"pyarrow"``: returns a pyarrow-backed nullable object
116+
117+ .. versionadded:: 2.0
118+
119+ Returns
120+ -------
121+ ret
122+ Numeric if parsing succeeded.
123+ Return type depends on input. Series if Series, otherwise ndarray.
124+
125+ See Also
126+ --------
127+ DataFrame.astype : Cast argument to a specified dtype.
128+ to_datetime : Convert argument to datetime.
129+ to_timedelta : Convert argument to timedelta.
130+ numpy.ndarray.astype : Cast a numpy array to a specified type.
131+ DataFrame.convert_dtypes : Convert dtypes.
132+
133+ Examples
134+ --------
135+ Take separate series and convert to numeric, coercing when told to
136+
137+ >>> s = pd.Series(["1.0", "2", -3])
138+ >>> pd.to_numeric(s)
139+ 0 1.0
140+ 1 2.0
141+ 2 -3.0
142+ dtype: float64
143+ >>> pd.to_numeric(s, downcast="float")
144+ 0 1.0
145+ 1 2.0
146+ 2 -3.0
147+ dtype: float32
148+ >>> pd.to_numeric(s, downcast="signed")
149+ 0 1
150+ 1 2
151+ 2 -3
152+ dtype: int8
153+ >>> s = pd.Series(["apple", "1.0", "2", -3])
154+ >>> pd.to_numeric(s, errors="coerce")
155+ 0 NaN
156+ 1 1.0
157+ 2 2.0
158+ 3 -3.0
159+ dtype: float64
160+
161+ Downcasting of nullable integer and floating dtypes is supported:
162+
163+ >>> s = pd.Series([1, 2, 3], dtype="Int64")
164+ >>> pd.to_numeric(s, downcast="integer")
165+ 0 1
166+ 1 2
167+ 2 3
168+ dtype: Int8
169+ >>> s = pd.Series([1.0, 2.1, 3.0], dtype="Float64")
170+ >>> pd.to_numeric(s, downcast="float")
171+ 0 1.0
172+ 1 2.1
173+ 2 3.0
174+ dtype: Float32
66175 """
67176 if downcast not in (None , "integer" , "signed" , "unsigned" , "float" ):
68177 raise ValueError ("invalid downcasting method provided" )
@@ -99,6 +208,8 @@ def to_numeric(
99208 else :
100209 values = arg
101210
211+ # GH33013: for IntegerArray & FloatingArray extract non-null values for casting
212+ # save mask to reconstruct the full array after casting
102213 mask : npt .NDArray [np .bool_ ] | None = None
103214 if isinstance (values , BaseMaskedArray ):
104215 mask = values ._mask
@@ -109,7 +220,6 @@ def to_numeric(
109220 mask = values .isna ()
110221 values = values .dropna ().to_numpy ()
111222 new_mask : np .ndarray | None = None
112-
113223 if is_numeric_dtype (values_dtype ):
114224 pass
115225 elif lib .is_np_dtype (values_dtype , "mM" ):
@@ -122,7 +232,7 @@ def to_numeric(
122232 parsed_value = parse_numeric (x )
123233 if libmissing .checknull (parsed_values ):
124234 if errors == "raise" :
125- raise ValueError (f"Unable to parse string '{ x } ' at position { idx } " )
235+ raise ValueError (f"Unable to parse string '{ x } ' at position{ idx } " )
126236 elif errors == "coerce" :
127237 parsed_values .append (libmissing .NA )
128238 new_mask .append (True )
@@ -135,6 +245,8 @@ def to_numeric(
135245 new_mask = np .array (new_mask , dtype = bool )
136246
137247 if new_mask is not None :
248+ # Remove unnecessary values; this is expected later anyway and enables
249+ # downcasting
138250 values = values [~ new_mask ]
139251 elif (
140252 dtype_backend is not lib .no_default
@@ -144,6 +256,8 @@ def to_numeric(
144256 ):
145257 new_mask = np .zeros (values .shape , dtype = np .bool_ )
146258
259+ # attempt downcast only if the data has been successfully converted
260+ # to a numerical dtype and if a downcast method has been specified
147261 if downcast is not None and is_numeric_dtype (values .dtype ):
148262 typecodes : str | None = None
149263
@@ -153,23 +267,30 @@ def to_numeric(
153267 typecodes = np .typecodes ["UnsignedInteger" ]
154268 elif downcast == "float" :
155269 typecodes = np .typecodes ["Float" ]
270+
271+ # pandas support goes only to np.float32,
272+ # as float dtypes smaller than that are
273+ # extremely rare and not well supported
156274 float_32_char = np .dtype (np .float32 ).char
157275 float_32_ind = typecodes .index (float_32_char )
158276 typecodes = typecodes [float_32_ind :]
159277
160278 if typecodes is not None :
279+ # from smallest to largest
161280 for typecode in typecodes :
162281 dtype = np .dtype (typecode )
163282 if dtype .itemsize <= values .dtype .itemsize :
164- # Only downcast if values are all integers
165- if downcast in ("integer" , "signed" , "unsigned" ) and not np .isin (np .mod (values , 1 ), 0 ).all ():
166- continue # Skip downcasting if there are any float values
167283 values = maybe_downcast_numeric (values , dtype )
284+
285+ # successful conversion
168286 if values .dtype == dtype :
169287 break
170288
289+ # GH33013: for IntegerArray, BooleanArray & FloatingArray we need to
290+ # reconstruct the masked array
171291 if (mask is not None or new_mask is not None ) and not is_string_dtype (values .dtype ):
172292 if mask is None or (new_mask is not None and new_mask .shape == mask .shape ):
293+ # GH 52588
173294 mask = new_mask
174295 else :
175296 mask = mask .copy ()
@@ -199,18 +320,12 @@ def to_numeric(
199320 if is_series :
200321 return arg ._constructor (values , index = arg .index , name = arg .name )
201322 elif is_index :
323+ # because we want to coerce to numeric if possible,
324+ # do not use _shallow_copy
202325 from pandas import Index
326+
203327 return Index (values , name = arg .name )
204328 elif is_scalars :
205329 return values [0 ]
206330 else :
207331 return values
208-
209-
210- if __name__ == "__main__" :
211- import numpy as np
212-
213- test_data = ["0x1A" , "0b1010" , "0o17" , "25" , "3.14" , "invalid" ]
214- result = to_numeric (test_data , errors = "coerce" )
215- print ("Inputs:" , test_data )
216- print ("ParseResult:" , result )
0 commit comments