Skip to content

Commit 609039f

Browse files
mraabhijitdependabot[bot]VYaswanthKumarNavya1707krishna-datta
authored andcommitted
FIX: itemsize wrong for date32[day][pyarrow] dtype #57948 (#62657)
Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Yaswanth Kumar <155723049+VYaswanthKumar@users.noreply.github.com> Co-authored-by: Navya Srivastava <143343265+Navya1707@users.noreply.github.com> Co-authored-by: krishna datta <19500807+krishna-datta@users.noreply.github.com> Co-authored-by: ZA1815 <zaahme18@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Akashisang <151737560+Akashisang@users.noreply.github.com> Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Co-authored-by: BreezeLune <1066773178@qq.com> Co-authored-by: jbrockmendel <jbrockmendel@gmail.com> Co-authored-by: Aokizy2 <3441854632@qq.com> Co-authored-by: aokizy <14817191+aokizy2@user.noreply.gitee.com> Co-authored-by: Sumeet Bhatnagar <69593471+nemo-1999@users.noreply.github.com>
1 parent b477b87 commit 609039f

File tree

4 files changed

+131
-4
lines changed

4 files changed

+131
-4
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1001,6 +1001,7 @@ Numeric
10011001
- Bug in :meth:`Series.dot` returning ``object`` dtype for :class:`ArrowDtype` and nullable-dtype data (:issue:`61375`)
10021002
- Bug in :meth:`Series.std` and :meth:`Series.var` when using complex-valued data (:issue:`61645`)
10031003
- Bug in ``np.matmul`` with :class:`Index` inputs raising a ``TypeError`` (:issue:`57079`)
1004+
- Bug in arithmetic operations between objects with numpy-nullable dtype and :class:`ArrowDtype` incorrectly raising (:issue:`58602`)
10041005

10051006
Conversion
10061007
^^^^^^^^^^

pandas/core/arrays/masked.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,10 @@
3737
is_string_dtype,
3838
pandas_dtype,
3939
)
40-
from pandas.core.dtypes.dtypes import BaseMaskedDtype
40+
from pandas.core.dtypes.dtypes import (
41+
ArrowDtype,
42+
BaseMaskedDtype,
43+
)
4144
from pandas.core.dtypes.missing import (
4245
array_equivalent,
4346
is_valid_na_for_dtype,
@@ -767,6 +770,10 @@ def _arith_method(self, other, op):
767770
pd_op = ops.get_array_op(op)
768771
other = ensure_wrapped_if_datetimelike(other)
769772

773+
if isinstance(other, ExtensionArray) and isinstance(other.dtype, ArrowDtype):
774+
# GH#58602
775+
return NotImplemented
776+
770777
if op_name in {"pow", "rpow"} and isinstance(other, np.bool_):
771778
# Avoid DeprecationWarning: In future, it will be an error
772779
# for 'np.bool_' scalars to be interpreted as an index
@@ -843,7 +850,11 @@ def _cmp_method(self, other, op) -> BooleanArray:
843850

844851
mask = None
845852

846-
if isinstance(other, BaseMaskedArray):
853+
if isinstance(other, ExtensionArray) and isinstance(other.dtype, ArrowDtype):
854+
# GH#58602
855+
return NotImplemented
856+
857+
elif isinstance(other, BaseMaskedArray):
847858
other, mask = other._data, other._mask
848859

849860
elif is_list_like(other):

pandas/core/dtypes/dtypes.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2308,8 +2308,35 @@ def kind(self) -> str:
23082308

23092309
@cache_readonly
def itemsize(self) -> int:
    """
    Return the size in bytes of a single element of this dtype.

    The value is derived as follows:

    - Boolean types report the itemsize of the equivalent NumPy dtype.
    - Other types for which pyarrow reports a ``bit_width`` report that
      width floor-divided by 8.
    - Types for which ``bit_width`` is unavailable (e.g. variable-width
      types) report the itemsize of the equivalent NumPy dtype.

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pandas as pd
    >>> dtype = pd.ArrowDtype(pa.int32())
    >>> dtype.itemsize
    4

    >>> dtype = pd.ArrowDtype(pa.bool_())
    >>> dtype.itemsize  # falls back to numpy dtype
    1
    """
    arrow_type = self.pyarrow_dtype

    # NOTE(review): booleans skip the bit_width path, presumably because
    # their bit width is below one byte and ``// 8`` would give 0 -- confirm.
    if not pa.types.is_boolean(arrow_type):
        try:
            # e.g. int32 -> 32 bits // 8 = 4 bytes
            return arrow_type.bit_width // 8
        except (ValueError, AttributeError, NotImplementedError):
            # No usable bit width; use the NumPy itemsize below.
            pass

    return self.numpy_dtype.itemsize
23132340

23142341
def construct_array_type(self) -> type_t[ArrowExtensionArray]:
23152342
"""

pandas/tests/extension/test_arrow.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3702,6 +3702,94 @@ def test_pow_with_all_na_float():
37023702
tm.assert_series_equal(result, expected)
37033703

37043704

3705+
def test_mul_numpy_nullable_with_pyarrow_float():
    """Check ``*`` and ``==`` between a Float64 Series and a float64[pyarrow] Series."""
    # GH#58602
    masked = pd.Series(range(5), dtype="Float64")
    arrow = pd.Series(range(5), dtype="float64[pyarrow]")

    # Multiplication must give the pyarrow-backed result in both operand orders.
    squares = pd.Series([0, 1, 4, 9, 16], dtype="float64[pyarrow]")
    tm.assert_series_equal(masked * arrow, squares)
    tm.assert_series_equal(arrow * masked, squares)

    # while we're here, let's check __eq__
    all_true = pd.Series([True] * 5, dtype="bool[pyarrow]")
    tm.assert_series_equal(masked == arrow, all_true)
    tm.assert_series_equal(arrow == masked, all_true)
3726+
3727+
@pytest.mark.parametrize(
3728+
"type_name, expected_size",
3729+
[
3730+
# Integer types
3731+
("int8", 1),
3732+
("int16", 2),
3733+
("int32", 4),
3734+
("int64", 8),
3735+
("uint8", 1),
3736+
("uint16", 2),
3737+
("uint32", 4),
3738+
("uint64", 8),
3739+
# Floating point types
3740+
("float16", 2),
3741+
("float32", 4),
3742+
("float64", 8),
3743+
# Boolean
3744+
("bool_", 1),
3745+
# Date and timestamp types
3746+
("date32", 4),
3747+
("date64", 8),
3748+
("timestamp", 8),
3749+
# Time types
3750+
("time32", 4),
3751+
("time64", 8),
3752+
# Decimal types
3753+
("decimal128", 16),
3754+
("decimal256", 32),
3755+
],
3756+
)
3757+
def test_arrow_dtype_itemsize_fixed_width(type_name, expected_size):
3758+
# GH 57948
3759+
3760+
parametric_type_map = {
3761+
"timestamp": pa.timestamp("ns"),
3762+
"time32": pa.time32("s"),
3763+
"time64": pa.time64("ns"),
3764+
"decimal128": pa.decimal128(38, 10),
3765+
"decimal256": pa.decimal256(76, 10),
3766+
}
3767+
3768+
if type_name in parametric_type_map:
3769+
arrow_type = parametric_type_map.get(type_name)
3770+
else:
3771+
arrow_type = getattr(pa, type_name)()
3772+
dtype = ArrowDtype(arrow_type)
3773+
3774+
if type_name == "bool_":
3775+
expected_size = dtype.numpy_dtype.itemsize
3776+
3777+
assert dtype.itemsize == expected_size, (
3778+
f"{type_name} expected {expected_size}, got {dtype.itemsize} "
3779+
f"(bit_width={getattr(dtype.pyarrow_dtype, 'bit_width', 'N/A')})"
3780+
)
3781+
3782+
3783+
@pytest.mark.parametrize("type_name", ["string", "binary", "large_string"])
3784+
def test_arrow_dtype_itemsize_variable_width(type_name):
3785+
# GH 57948
3786+
3787+
arrow_type = getattr(pa, type_name)()
3788+
dtype = ArrowDtype(arrow_type)
3789+
3790+
assert dtype.itemsize == dtype.numpy_dtype.itemsize
3791+
3792+
37053793
def test_cast_pontwise_result_decimal_nan():
37063794
# GH#62522 we don't want to get back null[pyarrow] here
37073795
ser = pd.Series([], dtype="float64[pyarrow]")

0 commit comments

Comments
 (0)