Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -953,6 +953,7 @@ I/O
- Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`)
- Bug in :meth:`DataFrame.from_records` ignoring ``columns`` and ``index`` parameters when ``data`` is an empty iterator and ``nrows=0``. (:issue:`61140`)
- Bug in :meth:`DataFrame.from_records` where ``columns`` parameter with numpy structured array was not reordering and filtering out the columns (:issue:`59717`)
- Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` where tz-aware datetimes were formatted inconsistently (microseconds could be dropped and the UTC offset was written without a colon); values are now always written as ``YYYY-MM-DD HH:MM:SS.ffffff+HH:MM`` when ``date_format=None`` (:issue:`62111`)
- Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`)
- Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`)
- Bug in :meth:`DataFrame.to_excel` where the :class:`MultiIndex` index with a period level was not a date (:issue:`60099`)
Expand Down
58 changes: 58 additions & 0 deletions pandas/io/formats/csvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
Sequence,
)
import csv as csvlib
from datetime import datetime as _pydatetime
import os
from typing import (
TYPE_CHECKING,
Expand All @@ -24,6 +25,8 @@
from pandas._typing import SequenceNotStr
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.common import is_object_dtype
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.generic import (
ABCDatetimeIndex,
ABCIndex,
Expand All @@ -47,6 +50,8 @@
npt,
)

from pandas.core.series import Series

from pandas.io.formats.format import DataFrameFormatter


Expand Down Expand Up @@ -312,11 +317,64 @@ def _save_body(self) -> None:
break
self._save_chunk(start_i, end_i)

# tz-aware CSV formatting helper
@staticmethod
def _csv_format_datetime_tz_ea(ser: Series, na_rep: str) -> Series:
    """
    Render a tz-aware datetime Series as canonical strings for CSV output.

    Every non-missing value is written as
    ``YYYY-MM-DD HH:MM:SS.ffffff+HH:MM``: the microsecond field is always
    present and the UTC offset is colon-separated.

    Parameters
    ----------
    ser : Series
        Series supporting the ``.dt`` accessor; the caller passes columns
        whose dtype is ``DatetimeTZDtype``.
    na_rep : str
        Text substituted for missing values.

    Returns
    -------
    Series
        The formatted strings, with missing entries replaced by ``na_rep``.
    """
    # NOTE(review): ``%f`` is a six-digit microsecond field, so nanoseconds
    # (if any) are not written - confirm this is the intended CSV precision.
    formatted = ser.dt.strftime("%Y-%m-%d %H:%M:%S.%f%z")

    # strftime's ``%z`` writes the offset with no separators: ``+HHMM``, or
    # ``+HHMMSS[.ffffff]`` when the offset is not a whole number of minutes.
    # The canonical CSV form uses colons (the same shape ``isoformat``
    # produces), so the separators are inserted here.
    # Sub-minute offsets first: +HHMMSS[.ffffff] -> +HH:MM:SS[.ffffff]
    formatted = formatted.str.replace(
        r"([+-]\d{2})(\d{2})(\d{2}(?:\.\d+)?)$", r"\1:\2:\3", regex=True
    )
    # Common case: +HHMM -> +HH:MM
    formatted = formatted.str.replace(r"([+-]\d{2})(\d{2})$", r"\1:\2", regex=True)

    return formatted.fillna(na_rep)

# tz-aware CSV formatting helper
@staticmethod
def _csv_format_py_tz_aware_obj(ser: Series, na_rep: str) -> Series:
    """
    Render tz-aware datetimes held in an object-dtype Series for CSV output.

    Elements that are tz-aware ``datetime`` instances (including
    subclasses) are replaced by ``YYYY-MM-DD HH:MM:SS.ffffff+HH:MM``
    strings. Every other element is left untouched, apart from missing
    values, which are replaced by ``na_rep``.

    Parameters
    ----------
    ser : Series
        Object-dtype Series to inspect.
    na_rep : str
        Text substituted for missing values.

    Returns
    -------
    Series
        Series with the same index and name as ``ser``. An empty input
        goes through the same path and is not cast to another dtype.
    """
    values = ser.to_numpy(object, copy=False)

    def _is_tzaware_dt(x: object) -> bool:
        # "Aware" requires both a tzinfo and a non-None UTC offset;
        # datetime.utcoffset() returns None when tzinfo is None.
        return isinstance(x, _pydatetime) and x.utcoffset() is not None

    def _render(x: _pydatetime) -> str:
        # Objects exposing a non-zero ``nanosecond`` attribute would lose
        # precision with timespec="microseconds", so widen the field for
        # those values only.
        spec = "nanoseconds" if getattr(x, "nanosecond", 0) else "microseconds"
        return x.isoformat(sep=" ", timespec=spec)

    # A list-built mask also handles the empty Series (shape (0,)), so no
    # special case is needed.
    mask = np.array([_is_tzaware_dt(x) for x in values], dtype=bool)
    if mask.any():
        out = values.copy()
        out[mask] = [_render(x) for x in values[mask]]
        # Pin object dtype so dtype inference cannot convert the
        # untouched elements.
        ser = ser._constructor(out, index=ser.index, name=ser.name, dtype=object)

    return ser.fillna(na_rep)

def _save_chunk(self, start_i: int, end_i: int) -> None:
# create the data for a chunk
slicer = slice(start_i, end_i)
df = self.obj.iloc[slicer]

# If user didn't set date_format, normalize tz-aware datetimes to a
# single canonical string form for CSV (GH 62111).
if self.date_format is None:
for col in df.columns:
col_dtype = df.dtypes[col]
if isinstance(col_dtype, DatetimeTZDtype):
df[col] = self._csv_format_datetime_tz_ea(df[col], self.na_rep)
elif is_object_dtype(col_dtype):
df[col] = self._csv_format_py_tz_aware_obj(df[col], self.na_rep)

res = df._get_values_for_csv(**self._number_format)
data = list(res._iter_column_arrays())

Expand Down
101 changes: 101 additions & 0 deletions pandas/tests/io/formats/test_to_csv.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
from datetime import (
datetime,
timedelta,
timezone,
)
import io
import os
import sys
Expand Down Expand Up @@ -712,6 +717,102 @@ def test_to_csv_encoding_binary_handle(self, mode):
handle.seek(0)
assert handle.read().startswith(b'\xef\xbb\xbf""')

"""
tz-aware timestamps with/without microseconds should be written consistently
Checks if the .ffffff format is consistent, even when microseconds==0

GH 62111
"""
Comment on lines +720 to +725
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you move this inside the test function.


def test_to_csv_tz_aware_consistent_microseconds_formatting_python_datetime(self):
    """
    tz-aware stdlib datetimes are written with a consistent format.

    The ``.ffffff`` field must be present on every row, even when
    microseconds == 0.
    """
    # GH 62111
    df = DataFrame(
        {
            "timestamp": [
                datetime(2025, 8, 14, 12, 34, 56, 0, tzinfo=timezone.utc),
                datetime(2025, 8, 14, 12, 34, 56, 1, tzinfo=timezone.utc),
            ]
        }
    )

    # With no path given, to_csv returns the CSV text directly, so no
    # temporary file is needed.
    result = df.to_csv(index=False, lineterminator="\n")

    expected = (
        "timestamp\n"
        "2025-08-14 12:34:56.000000+00:00\n"
        "2025-08-14 12:34:56.000001+00:00\n"
    )
    assert result == expected

def test_to_csv_tz_aware_consistent_microseconds_formatting_timestamp(self):
    """
    tz-aware ``pd.Timestamp`` values are written with a consistent format.

    The ``.ffffff`` field must be present on every row, even when
    microseconds == 0.
    """
    # GH 62111
    df = DataFrame(
        {
            "timestamp": [
                pd.Timestamp("2025-08-14 12:34:56+00:00"),
                pd.Timestamp("2025-08-14 12:34:56.000001+00:00"),
            ]
        }
    )

    # With no path given, to_csv returns the CSV text directly, so no
    # temporary file is needed.
    result = df.to_csv(index=False, lineterminator="\n")

    expected = (
        "timestamp\n"
        "2025-08-14 12:34:56.000000+00:00\n"
        "2025-08-14 12:34:56.000001+00:00\n"
    )
    assert result == expected

def test_to_csv_tz_aware_respects_date_format_python_datetime(self):
    """An explicit ``date_format`` is applied as given to tz-aware datetimes."""
    # GH 62111
    # No microseconds in date_format; %z produces +0000 (no colon) by design.
    df = DataFrame(
        {
            "timestamp": [
                datetime(2025, 8, 14, 12, 34, 56, 0, tzinfo=timezone.utc),
                datetime(2025, 8, 14, 12, 34, 56, 1, tzinfo=timezone.utc),
            ]
        }
    )

    # With no path given, to_csv returns the CSV text directly, so no
    # temporary file is needed.
    result = df.to_csv(
        index=False,
        lineterminator="\n",
        date_format="%Y-%m-%d %H:%M:%S%z",
    )

    expected = "timestamp\n2025-08-14 12:34:56+0000\n2025-08-14 12:34:56+0000\n"
    assert result == expected

def test_to_csv_tz_aware_consistent_microseconds_non_utc_offset_python_datetime(
    self,
):
    """
    A non-UTC fixed offset is written as ``+HH:MM`` with microseconds kept.
    """
    # GH 62111
    am_tz = timezone(timedelta(hours=4))  # +04:00 (Armenia / Asia/Yerevan)
    df = DataFrame(
        {
            "timestamp": [
                datetime(2025, 8, 14, 12, 34, 56, 0, tzinfo=am_tz),
                datetime(2025, 8, 14, 12, 34, 56, 1, tzinfo=am_tz),
            ]
        }
    )

    # With no path given, to_csv returns the CSV text directly, so no
    # temporary file is needed.
    result = df.to_csv(index=False, lineterminator="\n")

    expected = (
        "timestamp\n"
        "2025-08-14 12:34:56.000000+04:00\n"
        "2025-08-14 12:34:56.000001+04:00\n"
    )
    assert result == expected


def test_to_csv_iterative_compression_name(compression):
# GH 38714
Expand Down
Loading