Skip to content

Commit aac7af7

Browse files
author
cloudboat
committed
feat: handle DST transitions in timezone-aware resampling
1 parent 88c276a commit aac7af7

File tree

2 files changed

+309
-34
lines changed

2 files changed

+309
-34
lines changed

pandas/core/resample.py

Lines changed: 69 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -2576,42 +2576,80 @@ def _get_time_bins(self, ax: DatetimeIndex):
25762576
)
25772577

25782578
if len(ax) == 0:
2579-
binner = labels = DatetimeIndex(
2580-
data=[], freq=self.freq, name=ax.name, dtype=ax.dtype
2581-
)
2582-
return binner, [], labels
2579+
empty = DatetimeIndex(data=[], freq=self.freq, name=ax.name, dtype=ax.dtype)
2580+
return empty, [], empty
25832581

2584-
first, last = _get_timestamp_range_edges(
2585-
ax.min(),
2586-
ax.max(),
2587-
self.freq,
2588-
unit=ax.unit,
2589-
closed=self.closed,
2590-
origin=self.origin,
2591-
offset=self.offset,
2592-
)
2593-
# GH #12037
2594-
# use first/last directly instead of call replace() on them
2595-
# because replace() will swallow the nanosecond part
2596-
# thus last bin maybe slightly before the end if the end contains
2597-
# nanosecond part and lead to `Values falls after last bin` error
2598-
# GH 25758: If DST lands at midnight (e.g. 'America/Havana'), user feedback
2599-
# has noted that ambiguous=True provides the most sensible result
2600-
binner = labels = date_range(
2601-
freq=self.freq,
2602-
start=first,
2603-
end=last,
2604-
tz=ax.tz,
2605-
name=ax.name,
2606-
ambiguous=True,
2607-
nonexistent="shift_forward",
2608-
unit=ax.unit,
2609-
)
2582+
if ax.tz is not None:
2583+
try:
2584+
first, last = _get_timestamp_range_edges(
2585+
ax.min(),
2586+
ax.max(),
2587+
self.freq,
2588+
unit=ax.unit,
2589+
closed=self.closed,
2590+
origin=self.origin,
2591+
offset=self.offset,
2592+
)
2593+
binner = labels = date_range(
2594+
freq=self.freq,
2595+
start=first,
2596+
end=last,
2597+
tz=ax.tz,
2598+
name=ax.name,
2599+
ambiguous=True,
2600+
nonexistent="shift_forward",
2601+
unit=ax.unit,
2602+
)
2603+
except Exception as e:
2604+
if "nonexistent" not in str(e).lower():
2605+
raise
2606+
2607+
ax_utc = ax.tz_convert("UTC")
2608+
2609+
first_utc, last_utc = _get_timestamp_range_edges(
2610+
ax_utc.min(),
2611+
ax_utc.max(),
2612+
self.freq,
2613+
unit=ax.unit,
2614+
closed=self.closed,
2615+
origin=self.origin,
2616+
offset=self.offset,
2617+
)
2618+
2619+
binner_utc = date_range(
2620+
start=first_utc,
2621+
end=last_utc,
2622+
freq=self.freq,
2623+
tz="UTC",
2624+
name=ax.name,
2625+
unit=ax.unit,
2626+
)
2627+
2628+
binner = labels = binner_utc.tz_convert(ax.tz)
26102629

2630+
else:
2631+
first, last = _get_timestamp_range_edges(
2632+
ax.min(),
2633+
ax.max(),
2634+
self.freq,
2635+
unit=ax.unit,
2636+
closed=self.closed,
2637+
origin=self.origin,
2638+
offset=self.offset,
2639+
)
2640+
binner = labels = date_range(
2641+
freq=self.freq,
2642+
start=first,
2643+
end=last,
2644+
tz=ax.tz,
2645+
name=ax.name,
2646+
ambiguous=True,
2647+
nonexistent="shift_forward",
2648+
unit=ax.unit,
2649+
)
26112650
ax_values = ax.asi8
26122651
binner, bin_edges = self._adjust_bin_edges(binner, ax_values)
26132652

2614-
# general version, knowing nothing about relative frequencies
26152653
bins = lib.generate_bins_dt64(
26162654
ax_values, bin_edges, self.closed, hasnans=ax.hasnans
26172655
)
@@ -2627,9 +2665,6 @@ def _get_time_bins(self, ax: DatetimeIndex):
26272665
binner = binner.insert(0, NaT)
26282666
labels = labels.insert(0, NaT)
26292667

2630-
# if we end up with more labels than bins
2631-
# adjust the labels
2632-
# GH4076
26332668
if len(bins) < len(labels):
26342669
labels = labels[: len(bins)]
26352670

Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
import numpy as np
2+
import pytest
3+
4+
from pandas import (
5+
DataFrame,
6+
DatetimeIndex,
7+
)
8+
9+
10+
class TestResampleDSTAfricaCairo:
11+
"""DST transition tests for Africa/Cairo timezone."""
12+
13+
def test_resample_across_dst_transition(self):
14+
df = DataFrame(
15+
{"value": [1, 2]},
16+
index=DatetimeIndex(
17+
[
18+
"2024-04-26 01:00:00",
19+
"2024-04-27 00:00:00",
20+
]
21+
).tz_localize("Africa/Cairo", nonexistent="shift_forward"),
22+
)
23+
24+
result = df.resample("D").mean()
25+
26+
assert len(result) == 2
27+
assert isinstance(result.index, DatetimeIndex)
28+
assert result.index.tz is not None
29+
assert not result.isna().any().any()
30+
31+
def test_resample_before_dst_boundary(self):
32+
df = DataFrame(
33+
{"value": [76.0, 42.0]},
34+
index=DatetimeIndex(
35+
[
36+
"2024-04-24 00:00:00",
37+
"2024-04-25 00:00:00",
38+
]
39+
).tz_localize("Africa/Cairo"),
40+
)
41+
42+
result = df.resample("D").mean()
43+
44+
assert len(result) == 2
45+
assert isinstance(result.index, DatetimeIndex)
46+
assert "Africa/Cairo" in str(result.index.tz)
47+
assert result.iloc[0, 0] == 76.0
48+
assert result.iloc[1, 0] == 42.0
49+
50+
@pytest.mark.parametrize("freq", ["2h", "6h", "12h"])
51+
def test_resample_various_freq(self, freq):
52+
df = DataFrame(
53+
{"value": [1, 2, 3, 4, 5]},
54+
index=DatetimeIndex(
55+
[
56+
"2024-04-25 22:00:00",
57+
"2024-04-25 23:00:00",
58+
"2024-04-26 01:00:00",
59+
"2024-04-26 02:00:00",
60+
"2024-04-26 03:00:00",
61+
]
62+
).tz_localize("Africa/Cairo", nonexistent="shift_forward"),
63+
)
64+
65+
result = df.resample(freq).mean()
66+
67+
assert isinstance(result, DataFrame)
68+
assert len(result) > 0
69+
assert not result.isna().all().any()
70+
71+
def test_resample_closed_label_combinations(self):
72+
df = DataFrame(
73+
{"value": [1, 2]},
74+
index=DatetimeIndex(
75+
[
76+
"2024-04-26 01:00:00",
77+
"2024-04-27 00:00:00",
78+
]
79+
).tz_localize("Africa/Cairo", nonexistent="shift_forward"),
80+
)
81+
82+
for closed in ["left", "right"]:
83+
for label in ["left", "right"]:
84+
result = df.resample("D", closed=closed, label=label).mean()
85+
assert len(result) >= 1
86+
assert not result.isna().all().any()
87+
88+
def test_resample_nonexistent_times(self):
89+
timestamps = [
90+
"2024-04-25 23:00:00",
91+
"2024-04-26 00:30:00",
92+
"2024-04-26 01:00:00",
93+
]
94+
95+
df = DataFrame(
96+
{"value": [1, 2, 3]},
97+
index=DatetimeIndex(timestamps).tz_localize(
98+
"Africa/Cairo", nonexistent="shift_forward"
99+
),
100+
)
101+
102+
result = df.resample("h").mean()
103+
104+
assert len(result) > 0
105+
assert isinstance(result, DataFrame)
106+
107+
def test_resample_empty_dataframe(self):
108+
df = DataFrame({"value": []}, index=DatetimeIndex([], tz="Africa/Cairo"))
109+
110+
result = df.resample("D").mean()
111+
112+
assert len(result) == 0
113+
assert isinstance(result.index, DatetimeIndex)
114+
115+
def test_resample_single_point(self):
116+
df = DataFrame(
117+
{"value": [42.0]},
118+
index=DatetimeIndex(["2024-04-26 12:00:00"]).tz_localize(
119+
"Africa/Cairo", nonexistent="shift_forward"
120+
),
121+
)
122+
123+
result = df.resample("D").mean()
124+
125+
assert len(result) == 1
126+
assert result.iloc[0, 0] == 42.0
127+
128+
129+
class TestResampleDSTMultipleTimezones:
    """DST handling across multiple timezones."""

    # One collected test per timezone: a failure names the offending zone and
    # does not prevent the remaining zones from being exercised.
    @pytest.mark.parametrize(
        "tz, start, end",
        [
            ("Africa/Cairo", "2024-04-26 01:00:00", "2024-04-27 00:00:00"),
            ("Europe/London", "2024-03-31 01:00:00", "2024-04-01 00:00:00"),
            ("America/New_York", "2024-03-10 01:00:00", "2024-03-11 00:00:00"),
        ],
    )
    def test_resample_multiple_timezones(self, tz, start, end):
        """Daily resampling of two points in ``tz`` returns a tz-aware index.

        ``start`` and ``end`` are naive wall-clock strings localized to ``tz``
        with ``nonexistent="shift_forward"`` and ``ambiguous=True`` so that
        index construction itself cannot fail on a DST gap or overlap.
        """
        df = DataFrame(
            {"value": [1, 2]},
            index=DatetimeIndex([start, end]).tz_localize(
                tz, nonexistent="shift_forward", ambiguous=True
            ),
        )

        result = df.resample("D").mean()

        assert len(result) >= 1
        assert isinstance(result.index, DatetimeIndex)
        assert result.index.tz is not None
152+
153+
154+
class TestResampleDSTEdgeCases:
    """Boundary scenarios for resampling near a DST change (Africa/Cairo)."""

    def test_resample_multiple_dst_days(self):
        """Four points spread over four calendar days give at least 3 bins."""
        stamps = [
            "2024-04-25 23:00:00",
            "2024-04-26 01:00:00",
            "2024-04-27 00:00:00",
            "2024-04-28 00:00:00",
        ]
        cairo_index = DatetimeIndex(stamps).tz_localize(
            "Africa/Cairo", nonexistent="shift_forward"
        )
        frame = DataFrame({"value": [1, 2, 3, 4]}, index=cairo_index)

        daily = frame.resample("D").mean()

        assert len(daily) >= 3

    def test_resample_microsecond_precision(self):
        """Sub-second timestamps still land in two separate daily bins."""
        stamps = [
            "2024-04-26 01:00:00.123456",
            "2024-04-27 00:00:00.654321",
        ]
        cairo_index = DatetimeIndex(stamps).tz_localize(
            "Africa/Cairo", nonexistent="shift_forward"
        )
        frame = DataFrame({"value": [1.1, 2.2]}, index=cairo_index)

        daily = frame.resample("D").mean()

        assert len(daily) == 2

    def test_resample_with_na_values(self):
        """A NaN observation does not stop hourly resampling from running."""
        stamps = [
            "2024-04-25 23:00:00",
            "2024-04-26 01:00:00",
            "2024-04-26 02:00:00",
        ]
        cairo_index = DatetimeIndex(stamps).tz_localize(
            "Africa/Cairo", nonexistent="shift_forward"
        )
        frame = DataFrame({"value": [1.0, np.nan, 3.0]}, index=cairo_index)

        hourly = frame.resample("h").mean()

        assert len(hourly) > 0
        assert isinstance(hourly, DataFrame)
205+
206+
207+
class TestResampleDSTOriginalIssues:
    """Regression tests for the two scenarios that were originally reported."""

    def test_original_issue_1(self):
        """Daily resample of 2024-04-26 01:00 and 2024-04-27 00:00 (Cairo)."""
        cairo_index = DatetimeIndex(
            ["2024-04-26 01:00:00", "2024-04-27 00:00:00"]
        ).tz_localize("Africa/Cairo", nonexistent="shift_forward")
        frame = DataFrame({"value": [1, 2]}, index=cairo_index)

        daily = frame.resample("D").mean()

        # Only shape and null-ness are checked; bin labels are not pinned.
        assert len(daily) > 0
        assert not daily.isna().any().any()

    def test_original_issue_2(self):
        """Daily resample of 2024-04-24 and 2024-04-25 midnight (Cairo)."""
        cairo_index = DatetimeIndex(
            ["2024-04-24 00:00:00", "2024-04-25 00:00:00"]
        ).tz_localize("Africa/Cairo")
        frame = DataFrame({"value": [76.0, 42.0]}, index=cairo_index)

        daily = frame.resample("D").mean()

        # Only shape and null-ness are checked; bin labels are not pinned.
        assert len(daily) > 0
        assert not daily.isna().any().any()

0 commit comments

Comments
 (0)