From 7c3aa3b9c44a3643090fe39c6c39857b081c3eda Mon Sep 17 00:00:00 2001 From: Lzforevr Date: Wed, 2 Oct 2024 21:53:48 +0800 Subject: [PATCH 1/3] Enhanced numeric.py to process hexadecimal, octal, binary formats like 0x, 0o, 0b --- pandas/core/tools/numeric.py | 63 ++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 17 deletions(-) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 982851d0557c3..391bf28d62bbf 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -42,11 +42,23 @@ ) +def parse_numeric(value): + if isinstance(value, str): + try: + return int(value, 0) # Automatically detect radix + except ValueError: + try: + return float(value) + except ValueError: + return libmissing.NA + return value + + def to_numeric( - arg, - errors: DateTimeErrorChoices = "raise", - downcast: Literal["integer", "signed", "unsigned", "float"] | None = None, - dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + arg, + errors: DateTimeErrorChoices = "raise", + downcast: Literal["integer", "signed", "unsigned", "float"] | None = None, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ): """ Convert argument to a numeric type. 
@@ -214,25 +226,33 @@ def to_numeric( values = values.view(np.int64) else: values = ensure_object(values) - coerce_numeric = errors != "raise" - values, new_mask = lib.maybe_convert_numeric( # type: ignore[call-overload] - values, - set(), - coerce_numeric=coerce_numeric, - convert_to_masked_nullable=dtype_backend is not lib.no_default - or isinstance(values_dtype, StringDtype) - and values_dtype.na_value is libmissing.NA, - ) + parsed_values = [] + new_mask = [] + for idx, x in enumerate(values): + parsed_value = parse_numeric(x) + if libmissing.checknull(parsed_values): + if errors == 'raise': + raise ValueError(f"Unable to parse string '{x}' at position{idx}") + elif errors == 'coerce': + parsed_values.append(libmissing.NA) + new_mask.append(True) + continue + else: + parsed_values.append(parsed_value) + new_mask.append(False) + + values = np.array(parsed_values, dtype=object) + new_mask = np.array(new_mask, dtype=bool) if new_mask is not None: # Remove unnecessary values, is expected later anyway and enables # downcasting values = values[~new_mask] elif ( - dtype_backend is not lib.no_default - and new_mask is None - or isinstance(values_dtype, StringDtype) - and values_dtype.na_value is libmissing.NA + dtype_backend is not lib.no_default + and new_mask is None + or isinstance(values_dtype, StringDtype) + and values_dtype.na_value is libmissing.NA ): new_mask = np.zeros(values.shape, dtype=np.bool_) @@ -309,3 +329,12 @@ def to_numeric( return values[0] else: return values + + +if __name__ == "__main__": + import numpy as np + + test_data = ['0x1A', '0b1010', '0o17', '25', '3.14', 'invalid'] + result = to_numeric(test_data, errors='coerce') + print("Inputs:", test_data) + print("ParseResult:", result) From ec07f931a1034826b4ab7f2f60e2c01d2b880609 Mon Sep 17 00:00:00 2001 From: Lzforevr Date: Wed, 2 Oct 2024 23:23:07 +0800 Subject: [PATCH 2/3] Enhanced numeric.py to process hexadecimal, octal, binary formats like 0x, 0o, 0b --- pandas/core/tools/numeric.py | 144 
+++-------------------------------- 1 file changed, 10 insertions(+), 134 deletions(-) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 391bf28d62bbf..b744225a0e33a 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -62,116 +62,7 @@ def to_numeric( ): """ Convert argument to a numeric type. - - The default return dtype is `float64` or `int64` - depending on the data supplied. Use the `downcast` parameter - to obtain other dtypes. - - Please note that precision loss may occur if really large numbers - are passed in. Due to the internal limitations of `ndarray`, if - numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min) - or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are - passed in, it is very likely they will be converted to float so that - they can be stored in an `ndarray`. These warnings apply similarly to - `Series` since it internally leverages `ndarray`. - - Parameters - ---------- - arg : scalar, list, tuple, 1-d array, or Series - Argument to be converted. - - errors : {'raise', 'coerce'}, default 'raise' - - If 'raise', then invalid parsing will raise an exception. - - If 'coerce', then invalid parsing will be set as NaN. - - downcast : str, default None - Can be 'integer', 'signed', 'unsigned', or 'float'. - If not None, and if the data has been successfully cast to a - numerical dtype (or if the data was numeric to begin with), - downcast that resulting data to the smallest numerical dtype - possible according to the following rules: - - - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) - - 'unsigned': smallest unsigned int dtype (min.: np.uint8) - - 'float': smallest float dtype (min.: np.float32) - - As this behaviour is separate from the core conversion to - numeric values, any errors raised during the downcasting - will be surfaced regardless of the value of the 'errors' input. 
- - In addition, downcasting will only occur if the size - of the resulting data's dtype is strictly larger than - the dtype it is to be cast to, so if none of the dtypes - checked satisfy that specification, no downcasting will be - performed on the data. - - dtype_backend : {'numpy_nullable', 'pyarrow'} - Back-end data type applied to the resultant :class:`DataFrame` - (still experimental). If not specified, the default behavior - is to not use nullable data types. If specified, the behavior - is as follows: - - * ``"numpy_nullable"``: returns nullable-dtype-backed object - * ``"pyarrow"``: returns with pyarrow-backed nullable object - - .. versionadded:: 2.0 - - Returns - ------- - ret - Numeric if parsing succeeded. - Return type depends on input. Series if Series, otherwise ndarray. - - See Also - -------- - DataFrame.astype : Cast argument to a specified dtype. - to_datetime : Convert argument to datetime. - to_timedelta : Convert argument to timedelta. - numpy.ndarray.astype : Cast a numpy array to a specified type. - DataFrame.convert_dtypes : Convert dtypes. - - Examples - -------- - Take separate series and convert to numeric, coercing when told to - - >>> s = pd.Series(["1.0", "2", -3]) - >>> pd.to_numeric(s) - 0 1.0 - 1 2.0 - 2 -3.0 - dtype: float64 - >>> pd.to_numeric(s, downcast="float") - 0 1.0 - 1 2.0 - 2 -3.0 - dtype: float32 - >>> pd.to_numeric(s, downcast="signed") - 0 1 - 1 2 - 2 -3 - dtype: int8 - >>> s = pd.Series(["apple", "1.0", "2", -3]) - >>> pd.to_numeric(s, errors="coerce") - 0 NaN - 1 1.0 - 2 2.0 - 3 -3.0 - dtype: float64 - - Downcasting of nullable integer and floating dtypes is supported: - - >>> s = pd.Series([1, 2, 3], dtype="Int64") - >>> pd.to_numeric(s, downcast="integer") - 0 1 - 1 2 - 2 3 - dtype: Int8 - >>> s = pd.Series([1.0, 2.1, 3.0], dtype="Float64") - >>> pd.to_numeric(s, downcast="float") - 0 1.0 - 1 2.1 - 2 3.0 - dtype: Float32 + ... 
""" if downcast not in (None, "integer", "signed", "unsigned", "float"): raise ValueError("invalid downcasting method provided") @@ -208,8 +99,6 @@ def to_numeric( else: values = arg - # GH33013: for IntegerArray & FloatingArray extract non-null values for casting - # save mask to reconstruct the full array after casting mask: npt.NDArray[np.bool_] | None = None if isinstance(values, BaseMaskedArray): mask = values._mask @@ -220,6 +109,7 @@ def to_numeric( mask = values.isna() values = values.dropna().to_numpy() new_mask: np.ndarray | None = None + if is_numeric_dtype(values_dtype): pass elif lib.is_np_dtype(values_dtype, "mM"): @@ -231,9 +121,9 @@ def to_numeric( for idx, x in enumerate(values): parsed_value = parse_numeric(x) if libmissing.checknull(parsed_values): - if errors == 'raise': - raise ValueError(f"Unable to parse string '{x}' at position{idx}") - elif errors == 'coerce': + if errors == "raise": + raise ValueError(f"Unable to parse string '{x}' at position {idx}") + elif errors == "coerce": parsed_values.append(libmissing.NA) new_mask.append(True) continue @@ -245,8 +135,6 @@ def to_numeric( new_mask = np.array(new_mask, dtype=bool) if new_mask is not None: - # Remove unnecessary values, is expected later anyway and enables - # downcasting values = values[~new_mask] elif ( dtype_backend is not lib.no_default @@ -256,8 +144,6 @@ def to_numeric( ): new_mask = np.zeros(values.shape, dtype=np.bool_) - # attempt downcast only if the data has been successfully converted - # to a numerical dtype and if a downcast method has been specified if downcast is not None and is_numeric_dtype(values.dtype): typecodes: str | None = None @@ -267,30 +153,23 @@ def to_numeric( typecodes = np.typecodes["UnsignedInteger"] elif downcast == "float": typecodes = np.typecodes["Float"] - - # pandas support goes only to np.float32, - # as float dtypes smaller than that are - # extremely rare and not well supported float_32_char = np.dtype(np.float32).char float_32_ind = 
typecodes.index(float_32_char) typecodes = typecodes[float_32_ind:] if typecodes is not None: - # from smallest to largest for typecode in typecodes: dtype = np.dtype(typecode) if dtype.itemsize <= values.dtype.itemsize: + # Only downcast if values are all integers + if downcast in ("integer", "signed", "unsigned") and not np.isin(np.mod(values, 1), 0).all(): + continue # Skip downcasting if there are any float values values = maybe_downcast_numeric(values, dtype) - - # successful conversion if values.dtype == dtype: break - # GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct - # masked array if (mask is not None or new_mask is not None) and not is_string_dtype(values.dtype): if mask is None or (new_mask is not None and new_mask.shape == mask.shape): - # GH 52588 mask = new_mask else: mask = mask.copy() @@ -320,10 +199,7 @@ def to_numeric( if is_series: return arg._constructor(values, index=arg.index, name=arg.name) elif is_index: - # because we want to coerce to numeric if possible, - # do not use _shallow_copy from pandas import Index - return Index(values, name=arg.name) elif is_scalars: return values[0] @@ -334,7 +210,7 @@ def to_numeric( if __name__ == "__main__": import numpy as np - test_data = ['0x1A', '0b1010', '0o17', '25', '3.14', 'invalid'] - result = to_numeric(test_data, errors='coerce') + test_data = ["0x1A", "0b1010", "0o17", "25", "3.14", "invalid"] + result = to_numeric(test_data, errors="coerce") print("Inputs:", test_data) print("ParseResult:", result) From e618573c9891965f1f4b1633050e71fce61c4a67 Mon Sep 17 00:00:00 2001 From: Lzforevr Date: Wed, 2 Oct 2024 23:28:40 +0800 Subject: [PATCH 3/3] Enhanced numeric.py to process hexadecimal, octal, binary formats like 0x, 0o, 0b --- pandas/core/tools/numeric.py | 145 +++++++++++++++++++++++++++++++---- 1 file changed, 130 insertions(+), 15 deletions(-) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index b744225a0e33a..b8fa0c6381638 100644 --- 
a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -62,7 +62,116 @@ def to_numeric( ): """ Convert argument to a numeric type. - ... + + The default return dtype is `float64` or `int64` + depending on the data supplied. Use the `downcast` parameter + to obtain other dtypes. + + Please note that precision loss may occur if really large numbers + are passed in. Due to the internal limitations of `ndarray`, if + numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min) + or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are + passed in, it is very likely they will be converted to float so that + they can be stored in an `ndarray`. These warnings apply similarly to + `Series` since it internally leverages `ndarray`. + + Parameters + ---------- + arg : scalar, list, tuple, 1-d array, or Series + Argument to be converted. + + errors : {'raise', 'coerce'}, default 'raise' + - If 'raise', then invalid parsing will raise an exception. + - If 'coerce', then invalid parsing will be set as NaN. + + downcast : str, default None + Can be 'integer', 'signed', 'unsigned', or 'float'. + If not None, and if the data has been successfully cast to a + numerical dtype (or if the data was numeric to begin with), + downcast that resulting data to the smallest numerical dtype + possible according to the following rules: + + - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) + - 'unsigned': smallest unsigned int dtype (min.: np.uint8) + - 'float': smallest float dtype (min.: np.float32) + + As this behaviour is separate from the core conversion to + numeric values, any errors raised during the downcasting + will be surfaced regardless of the value of the 'errors' input. + + In addition, downcasting will only occur if the size + of the resulting data's dtype is strictly larger than + the dtype it is to be cast to, so if none of the dtypes + checked satisfy that specification, no downcasting will be + performed on the data. 
+ + dtype_backend : {'numpy_nullable', 'pyarrow'} + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). If not specified, the default behavior + is to not use nullable data types. If specified, the behavior + is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed object + * ``"pyarrow"``: returns with pyarrow-backed nullable object + + .. versionadded:: 2.0 + + Returns + ------- + ret + Numeric if parsing succeeded. + Return type depends on input. Series if Series, otherwise ndarray. + + See Also + -------- + DataFrame.astype : Cast argument to a specified dtype. + to_datetime : Convert argument to datetime. + to_timedelta : Convert argument to timedelta. + numpy.ndarray.astype : Cast a numpy array to a specified type. + DataFrame.convert_dtypes : Convert dtypes. + + Examples + -------- + Take separate series and convert to numeric, coercing when told to + + >>> s = pd.Series(["1.0", "2", -3]) + >>> pd.to_numeric(s) + 0 1.0 + 1 2.0 + 2 -3.0 + dtype: float64 + >>> pd.to_numeric(s, downcast="float") + 0 1.0 + 1 2.0 + 2 -3.0 + dtype: float32 + >>> pd.to_numeric(s, downcast="signed") + 0 1 + 1 2 + 2 -3 + dtype: int8 + >>> s = pd.Series(["apple", "1.0", "2", -3]) + >>> pd.to_numeric(s, errors="coerce") + 0 NaN + 1 1.0 + 2 2.0 + 3 -3.0 + dtype: float64 + + Downcasting of nullable integer and floating dtypes is supported: + + >>> s = pd.Series([1, 2, 3], dtype="Int64") + >>> pd.to_numeric(s, downcast="integer") + 0 1 + 1 2 + 2 3 + dtype: Int8 + >>> s = pd.Series([1.0, 2.1, 3.0], dtype="Float64") + >>> pd.to_numeric(s, downcast="float") + 0 1.0 + 1 2.1 + 2 3.0 + dtype: Float32 """ if downcast not in (None, "integer", "signed", "unsigned", "float"): raise ValueError("invalid downcasting method provided") @@ -99,6 +208,8 @@ def to_numeric( else: values = arg + # GH33013: for IntegerArray & FloatingArray extract non-null values for casting + # save mask to reconstruct the full array after casting mask: npt.NDArray[np.bool_] 
| None = None if isinstance(values, BaseMaskedArray): mask = values._mask @@ -109,7 +220,6 @@ def to_numeric( mask = values.isna() values = values.dropna().to_numpy() new_mask: np.ndarray | None = None - if is_numeric_dtype(values_dtype): pass elif lib.is_np_dtype(values_dtype, "mM"): @@ -122,7 +232,7 @@ def to_numeric( parsed_value = parse_numeric(x) if libmissing.checknull(parsed_values): if errors == "raise": - raise ValueError(f"Unable to parse string '{x}' at position {idx}") + raise ValueError(f"Unable to parse string '{x}' at position{idx}") elif errors == "coerce": parsed_values.append(libmissing.NA) new_mask.append(True) @@ -135,6 +245,8 @@ def to_numeric( new_mask = np.array(new_mask, dtype=bool) if new_mask is not None: + # Remove unnecessary values, is expected later anyway and enables + # downcasting values = values[~new_mask] elif ( dtype_backend is not lib.no_default @@ -144,6 +256,8 @@ def to_numeric( ): new_mask = np.zeros(values.shape, dtype=np.bool_) + # attempt downcast only if the data has been successfully converted + # to a numerical dtype and if a downcast method has been specified if downcast is not None and is_numeric_dtype(values.dtype): typecodes: str | None = None @@ -153,23 +267,30 @@ def to_numeric( typecodes = np.typecodes["UnsignedInteger"] elif downcast == "float": typecodes = np.typecodes["Float"] + + # pandas support goes only to np.float32, + # as float dtypes smaller than that are + # extremely rare and not well supported float_32_char = np.dtype(np.float32).char float_32_ind = typecodes.index(float_32_char) typecodes = typecodes[float_32_ind:] if typecodes is not None: + # from smallest to largest for typecode in typecodes: dtype = np.dtype(typecode) if dtype.itemsize <= values.dtype.itemsize: - # Only downcast if values are all integers - if downcast in ("integer", "signed", "unsigned") and not np.isin(np.mod(values, 1), 0).all(): - continue # Skip downcasting if there are any float values values = 
maybe_downcast_numeric(values, dtype) + + # successful conversion if values.dtype == dtype: break + # GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct + # masked array if (mask is not None or new_mask is not None) and not is_string_dtype(values.dtype): if mask is None or (new_mask is not None and new_mask.shape == mask.shape): + # GH 52588 mask = new_mask else: mask = mask.copy() @@ -199,18 +320,12 @@ def to_numeric( if is_series: return arg._constructor(values, index=arg.index, name=arg.name) elif is_index: + # because we want to coerce to numeric if possible, + # do not use _shallow_copy from pandas import Index + return Index(values, name=arg.name) elif is_scalars: return values[0] else: return values - - -if __name__ == "__main__": - import numpy as np - - test_data = ["0x1A", "0b1010", "0o17", "25", "3.14", "invalid"] - result = to_numeric(test_data, errors="coerce") - print("Inputs:", test_data) - print("ParseResult:", result)