Skip to content

Commit 7569baa

Browse files
committed
Work in progress: low-level dtype preservation not yet fixed
1 parent 2fa5d5d commit 7569baa

File tree

9 files changed

+449
-71
lines changed

9 files changed

+449
-71
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,3 +141,6 @@ doc/source/savefig/
141141
# Pyodide/WASM related files #
142142
##############################
143143
/.pyodide-xbuildenv-*
144+
145+
146+
.venv/

local_test.py

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Working test script that reproduces the exact failing test cases
4+
"""
5+
6+
import pandas as pd
7+
import pyarrow as pa
8+
from pandas.core.arrays import ArrowExtensionArray
9+
10+
def test_timestamp():
    """Reproduce the failing timestamp case: an identity map() over an
    Arrow-backed ns/US-Eastern array should keep its dtype unchanged.
    Returns True when the dtypes match (i.e. the upstream test would pass)."""
    print("=== Testing Timestamp Case ===")

    # Build an ns-precision, tz-aware Arrow array containing a missing value.
    stamps = pd.to_datetime(['2020-01-01 01:01:01.000001']).tz_localize('US/Eastern')
    ns_dtype = pd.ArrowDtype(pa.timestamp('ns', tz='US/Eastern'))
    arr = ArrowExtensionArray._from_sequence([pd.NA, stamps[0]], dtype=ns_dtype)

    def _describe(header, ea):
        # Dump the dtype details the upstream test compares on.
        print(header)
        print(f" dtype: {ea.dtype}")
        print(f" pyarrow_dtype: {ea.dtype.pyarrow_dtype}")
        print(f" unit: {ea.dtype.pyarrow_dtype.unit}")
        print(f" timezone: {ea.dtype.pyarrow_dtype.tz}")
        print(f" values: {ea}")
        print()

    _describe("Original array:", arr)

    # The identity map is where dtype information was being lost.
    print("Testing map operation:")
    mapped = arr.map(lambda x: x, na_action='ignore')

    _describe("Result array:", mapped)

    same_dtype = arr.dtype == mapped.dtype
    print(f"Timestamp dtypes equal: {same_dtype}")

    if same_dtype:
        print("✅ Timestamp test would pass!")
    else:
        print("❌ TIMESTAMP TEST WOULD FAIL!")
        print(f"Expected: {arr.dtype}")
        print(f"Got: {mapped.dtype}")

    return same_dtype
def test_integer():
    """Reproduce the failing integer case: an identity map() over an
    Arrow-backed int64 array should keep its dtype unchanged.
    Returns True when the dtypes match (i.e. the upstream test would pass)."""
    print("\n=== Testing Integer Case ===")

    # Arrow int64 array with a missing value, mirroring the failing test.
    arr = ArrowExtensionArray._from_sequence(
        [pd.NA, 1], dtype=pd.ArrowDtype(pa.int64())
    )

    print("Original array:")
    print(f" dtype: {arr.dtype}")
    print(f" pyarrow_dtype: {arr.dtype.pyarrow_dtype}")
    print(f" values: {arr}")
    print(f" _pa_array.type: {arr._pa_array.type}")
    print()

    # The identity map is where dtype information was being lost.
    print("Testing map operation:")
    mapped = arr.map(lambda x: x, na_action='ignore')

    print("Result array:")
    print(f" dtype: {mapped.dtype}")
    print(f" pyarrow_dtype: {mapped.dtype.pyarrow_dtype}")
    print(f" values: {mapped}")
    print()

    same_dtype = arr.dtype == mapped.dtype
    print(f"Integer dtypes equal: {same_dtype}")

    if same_dtype:
        print("✅ Integer test would pass!")
    else:
        print("❌ INTEGER TEST WOULD FAIL!")
        print(f"Expected: {arr.dtype}")
        print(f"Got: {mapped.dtype}")

    return same_dtype
def test_cast_pointwise_directly():
    """Exercise ArrowExtensionArray._cast_pointwise_result directly:
    given a result array with the 'wrong' dtype, casting back against the
    original should restore the original dtype (timestamp and integer cases)."""
    print("\n=== Testing _cast_pointwise_result directly ===")

    # --- timestamp: ns original vs us 'wrong' result ---------------------
    print("Testing timestamp cast:")
    stamps = pd.to_datetime(['2020-01-01 01:01:01.000001']).tz_localize('US/Eastern')
    good_ts = ArrowExtensionArray._from_sequence(
        [pd.NA, stamps[0]], dtype=pd.ArrowDtype(pa.timestamp('ns', tz='US/Eastern'))
    )
    bad_ts = ArrowExtensionArray._from_sequence(
        [pd.NA, stamps[0]], dtype=pd.ArrowDtype(pa.timestamp('us', tz='US/Eastern'))
    )

    print(f"Original (ns): {good_ts.dtype}")
    print(f"Wrong (us): {bad_ts.dtype}")

    try:
        restored = good_ts._cast_pointwise_result(bad_ts)
        print(f"Fixed result: {restored.dtype}")
        print(f"Timestamp fix works: {good_ts.dtype == restored.dtype}")
    except Exception as e:
        print(f"Timestamp cast error: {e}")

    # --- integer: int64 original vs double 'wrong' result -----------------
    print("\nTesting integer cast:")
    good_int = ArrowExtensionArray._from_sequence(
        [pd.NA, 1], dtype=pd.ArrowDtype(pa.int64())
    )
    bad_int = ArrowExtensionArray._from_sequence(
        [pd.NA, 1.0], dtype=pd.ArrowDtype(pa.float64())
    )

    print(f"Original (int64): {good_int.dtype}")
    print(f"Wrong (double): {bad_int.dtype}")

    try:
        restored = good_int._cast_pointwise_result(bad_int)
        print(f"Fixed result: {restored.dtype}")
        print(f"Integer fix works: {good_int.dtype == restored.dtype}")
    except Exception as e:
        print(f"Integer cast error: {e}")
def debug_pa_array_creation():
    """Show what type pa.array(..., from_pandas=True) infers for integer,
    float, and mixed-NA inputs — type inference here is the suspected cause
    of the dtype loss."""
    print("\n=== Debugging pa.array() behavior ===")

    def _probe(header, values):
        # Print the inferred Arrow type for one input list.
        print(header)
        inferred = pa.array(values, from_pandas=True)
        print(f" Input: {values}")
        print(f" Result type: {inferred.type}")

    _probe("Testing pa.array with integer values:", [None, 1])
    _probe("Testing pa.array with float values:", [None, 1.0])
    # Mixed pd.NA + int is the case most likely to trip inference.
    _probe("Testing pa.array with mixed NA/int values:", [pd.NA, 1])
if __name__ == "__main__":
    # Run every reproduction and print a pass/fail summary at the end.
    banner = "=" * 60
    print("Testing Arrow dtype preservation issues...")
    print(banner)

    timestamp_ok = test_timestamp()
    integer_ok = test_integer()
    test_cast_pointwise_directly()
    debug_pa_array_creation()

    def _verdict(ok):
        # Human-readable pass/fail tag for the summary lines.
        return '✅ PASS' if ok else '❌ FAIL'

    print("\n" + banner)
    print("SUMMARY:")
    print(f"Timestamp test: {_verdict(timestamp_ok)}")
    print(f"Integer test: {_verdict(integer_ok)}")

pandas/_libs/lib.pyx

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2913,12 +2913,38 @@ def maybe_convert_objects(ndarray[object] objects,
29132913
result[mask] = 1
29142914
result = IntegerArray(result, mask)
29152915
elif result is floats and convert_to_nullable_dtype:
2916-
from pandas.core.arrays import FloatingArray
2917-
2918-
# Set these values to 1.0 to be deterministic, match
2919-
# FloatingDtype._internal_fill_value
2920-
result[mask] = 1.0
2921-
result = FloatingArray(result, mask)
2916+
# Try to preserve integer EAs: if all valid values are integer-like,
2917+
# downcast to an IntegerArray instead of FloatingArray.
2918+
cvals = result[~mask]
2919+
if cvals.size == 0:
2920+
all_int_like = True
2921+
saw_negative = False
2922+
else:
2923+
all_int_like = np.all(np.isfinite(cvals)) and np.all(cvals == np.trunc(cvals))
2924+
saw_negative = np.any(cvals < 0)
2925+
2926+
if all_int_like:
2927+
from pandas.core.arrays import IntegerArray
2928+
# choose signedness from data
2929+
signed = bool(saw_negative)
2930+
# choose itemsize: reuse largest seen, else default to 8 bytes
2931+
itemsize = itemsize_max if itemsize_max > 0 else 8
2932+
if itemsize not in (1, 2, 4, 8):
2933+
itemsize = 8
2934+
# build dtype code like 'i8' / 'u8'
2935+
code = ('i' if signed else 'u') + str(itemsize)
2936+
int_vals = cvals.astype(code, copy=False)
2937+
vals = np.empty(result.shape, dtype=int_vals.dtype)
2938+
vals[~mask] = int_vals
2939+
# match IntegerDtype._internal_fill_value deterministically
2940+
vals[mask] = 1
2941+
return IntegerArray(vals, mask)
2942+
else:
2943+
from pandas.core.arrays import FloatingArray
2944+
# Set these values to 1.0 to be deterministic, match
2945+
# FloatingDtype._internal_fill_value
2946+
result[mask] = 1.0
2947+
result = FloatingArray(result, mask)
29222948

29232949
if result is uints or result is ints or result is floats or result is complexes:
29242950
# cast to the largest itemsize when all values are NumPy scalars

pandas/_libs/tslibs/timedeltas.pyx

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ from cpython.object cimport (
1717
)
1818

1919
import numpy as np
20+
import pyarrow as pa
2021

2122
cimport numpy as cnp
2223
from numpy cimport (
@@ -2311,8 +2312,8 @@ class Timedelta(_Timedelta):
23112312
# see also: item_from_zerodim
23122313
item = cnp.PyArray_ToScalar(cnp.PyArray_DATA(other), other)
23132314
return self.__mul__(item)
2314-
return other * self.to_timedelta64()
2315-
2315+
result = other * self.to_timedelta64()
2316+
return type(self)(pa.array(result))
23162317
return NotImplemented
23172318

23182319
__rmul__ = __mul__
@@ -2323,6 +2324,8 @@ class Timedelta(_Timedelta):
23232324
other = Timedelta(other)
23242325
if other is NaT:
23252326
return np.nan
2327+
if other._value == 0:
2328+
return NaT # np.NaT attribute error
23262329
if other._creso != self._creso:
23272330
self, other = self._maybe_cast_to_matching_resos(other)
23282331
return self._value/ float(other._value)
@@ -2337,6 +2340,9 @@ class Timedelta(_Timedelta):
23372340
other = int(other)
23382341
if isinstance(other, cnp.floating):
23392342
other = float(other)
2343+
if isinstance(other, Timedelta): # ratio
2344+
return self._value / other._value
2345+
# scaling
23402346
return Timedelta._from_value_and_reso(
23412347
<int64_t>(self._value/ other), self._creso
23422348
)
@@ -2346,7 +2352,8 @@ class Timedelta(_Timedelta):
23462352
# see also: item_from_zerodim
23472353
item = cnp.PyArray_ToScalar(cnp.PyArray_DATA(other), other)
23482354
return self.__truediv__(item)
2349-
return self.to_timedelta64() / other
2355+
result = self.to_timedelta64() / other
2356+
return type(self)(pa.array(result))
23502357

23512358
return NotImplemented
23522359

@@ -2372,8 +2379,8 @@ class Timedelta(_Timedelta):
23722379
# TODO: if other.dtype.kind == "m" and other.dtype != self.asm8.dtype
23732380
# then should disallow for consistency with scalar behavior; requires
23742381
# deprecation cycle. (or changing scalar behavior)
2375-
return other / self.to_timedelta64()
2376-
2382+
result = other / self.to_timedelta64()
2383+
return type(self)(pa.array(result))
23772384
return NotImplemented
23782385

23792386
def __floordiv__(self, other):
@@ -2426,7 +2433,7 @@ class Timedelta(_Timedelta):
24262433
return self // other.item()
24272434
else:
24282435
return self.to_timedelta64() // other
2429-
2436+
24302437
raise TypeError(f"Invalid dtype {other.dtype} for __floordiv__")
24312438

24322439
return NotImplemented

0 commit comments

Comments
 (0)