Skip to content

Commit 840e62a

Browse files
committed
fix ArrowDtype.itemsize
1 parent 1863adb commit 840e62a

File tree

2 files changed

+117
-2
lines changed

2 files changed

+117
-2
lines changed

pandas/core/dtypes/dtypes.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2308,8 +2308,34 @@ def kind(self) -> str:
23082308

23092309
@cache_readonly
def itemsize(self) -> int:
    """
    Return the number of bytes in this dtype.

    For Arrow-backed dtypes:

    - Fixed-width types whose bit width is a positive whole number of bytes
      return ``bit_width // 8``.
    - Fixed-width types narrower than one byte (e.g. boolean) return the
      NumPy dtype itemsize, because ``bit_width // 8`` would be 0.
    - Types whose ``bit_width`` lookup raises ``ValueError`` or
      ``AttributeError`` (variable-width or unsupported types) fall back to
      the NumPy dtype itemsize.

    Returns
    -------
    int
        Size of one element in bytes.

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pandas as pd
    >>> dtype = pd.ArrowDtype(pa.int32())
    >>> dtype.itemsize
    4

    >>> dtype = pd.ArrowDtype(pa.bool_())
    >>> dtype.itemsize  # falls back to numpy dtype
    1
    """
    # Guard only the bit_width lookup: that is the one step expected to fail
    # for types without a fixed width. Keeping the fallback outside the
    # ``try`` means an error raised by ``numpy_dtype`` is never swallowed
    # and retried.
    try:
        bit_width = self.pyarrow_dtype.bit_width
    except (ValueError, AttributeError):
        return self.numpy_dtype.itemsize

    # e.g. int32 -> 32 bits -> (4 bytes, 0 leftover bits)
    whole_bytes, leftover_bits = divmod(bit_width, 8)
    if whole_bytes <= 0 or leftover_bits:
        # Sub-byte or non-byte-aligned widths (boolean is 1 bit) cannot be
        # expressed as a whole number of bytes; defer to NumPy instead of
        # returning 0 or a truncated size.
        return self.numpy_dtype.itemsize
    return whole_bytes
23132339

23142340
def construct_array_type(self) -> type_t[ArrowExtensionArray]:
23152341
"""

pandas/tests/dtypes/test_dtypes.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1259,3 +1259,92 @@ def test_categorical_nan_no_dtype_conversion():
12591259
expected = pd.DataFrame({"a": Categorical([1], [1]), "b": [1]})
12601260
df.loc[0, "a"] = np.array([1])
12611261
tm.assert_frame_equal(df, expected)
1262+
1263+
1264+
@pytest.fixture
def pa():
    """Return the ``pyarrow`` module obtained through ``pytest.importorskip``."""
    pyarrow_module = pytest.importorskip("pyarrow")
    return pyarrow_module
1267+
1268+
1269+
@pytest.mark.parametrize(
    "type_name, expected_size",
    [
        # Integer types
        ("int8", 1),
        ("int16", 2),
        ("int32", 4),
        ("int64", 8),
        ("uint8", 1),
        ("uint16", 2),
        ("uint32", 4),
        ("uint64", 8),
        # Floating point types
        ("float16", 2),
        ("float32", 4),
        ("float64", 8),
        # Boolean
        ("bool_", 1),
        # Date and timestamp types
        ("date32", 4),
        ("date64", 8),
        ("timestamp", 8),
        # Time types
        ("time32", 4),
        ("time64", 8),
        # Decimal types
        ("decimal128", 16),
        ("decimal256", 32),
    ],
)  # type: ignore[misc]
def test_arrow_dtype_itemsize_fixed_width(pa, type_name, expected_size):
    """
    Check ``ArrowDtype.itemsize`` against a hard-coded byte size.

    ``type_name`` is the name of a ``pyarrow`` type factory; factories that
    need arguments get them from ``factory_args`` below.
    """
    # GH 57948

    # Positional arguments for the factories that cannot be called bare.
    factory_args = {
        "timestamp": ("ns",),
        "time32": ("s",),
        "time64": ("ns",),
        "decimal128": (38, 10),
        "decimal256": (76, 10),
    }

    # Build only the requested type rather than every parametric type.
    arrow_type = getattr(pa, type_name)(*factory_args.get(type_name, ()))
    dtype = pd.ArrowDtype(arrow_type)

    # The boolean case is compared against the literal 1 from the parameter
    # list; deriving the expectation from ``dtype.numpy_dtype.itemsize``
    # would compare the implementation's fallback with itself.
    assert dtype.itemsize == expected_size, (
        f"{type_name} expected {expected_size}, got {dtype.itemsize} "
        f"(bit_width={getattr(dtype.pyarrow_dtype, 'bit_width', 'N/A')})"
    )
1323+
1324+
1325+
@pytest.mark.parametrize("type_name", ["string", "binary", "large_string"])
def test_arrow_dtype_itemsize_variable_width(pa, type_name):
    """Check that ``itemsize`` equals the NumPy dtype itemsize for these types."""
    # GH 57948
    make_arrow_type = getattr(pa, type_name)
    arrow_dtype = pd.ArrowDtype(make_arrow_type())

    observed = arrow_dtype.itemsize
    expected = arrow_dtype.numpy_dtype.itemsize
    assert observed == expected
1333+
1334+
1335+
def test_arrow_dtype_error_fallback(pa, monkeypatch):
    """Check that ``itemsize`` uses the NumPy itemsize when ``bit_width`` raises."""
    # GH 57948

    class _FailingArrowType:
        # Stand-in for a pyarrow type whose ``bit_width`` lookup fails.
        # NOTE(review): ``id`` is presumably read by pyarrow/pandas type
        # checks on the patched attribute -- confirm before removing.
        id = None

        def to_pandas_dtype(self):
            return pd.Series([0]).dtype

        @property
        def bit_width(self):
            raise ValueError("Simulated Error")

    arrow_dtype = pd.ArrowDtype(pa.int32())
    failing_type = _FailingArrowType()
    monkeypatch.setattr(arrow_dtype, "pyarrow_dtype", failing_type)

    observed = arrow_dtype.itemsize
    assert observed == arrow_dtype.numpy_dtype.itemsize

0 commit comments

Comments
 (0)