diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index b94d82f3c9783..1e392c9bbcf8c 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -953,6 +953,7 @@ I/O
 - Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`)
 - Bug in :meth:`DataFrame.from_records` ignoring ``columns`` and ``index`` parameters when ``data`` is an empty iterator and ``nrows=0``. (:issue:`61140`)
 - Bug in :meth:`DataFrame.from_records` where ``columns`` parameter with numpy structured array was not reordering and filtering out the columns (:issue:`59717`)
+- Bug in :meth:`DataFrame.to_csv` / :meth:`Series.to_csv` formatting tz-aware datetimes (microseconds dropped, offset lacked colon); now consistently emits ``YYYY-MM-DD HH:MM:SS.ffffff+HH:MM`` when ``date_format=None``. (:issue:`62111`)
 - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`)
 - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`)
 - Bug in :meth:`DataFrame.to_excel` where the :class:`MultiIndex` index with a period level was not a date (:issue:`60099`)
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index 1b9eb6303fe74..76180878d0c51 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -11,6 +11,7 @@
     Sequence,
 )
 import csv as csvlib
+from datetime import datetime as _pydatetime
 import os
 from typing import (
     TYPE_CHECKING,
@@ -24,6 +25,8 @@
 from pandas._typing import SequenceNotStr
 from pandas.util._decorators import cache_readonly
 
+from pandas.core.dtypes.common import is_object_dtype
+from pandas.core.dtypes.dtypes import DatetimeTZDtype
 from pandas.core.dtypes.generic import (
     ABCDatetimeIndex,
     ABCIndex,
@@ -47,6 +50,8 @@
         npt,
     )
 
+    from pandas.core.series import Series
+
     from pandas.io.formats.format import DataFrameFormatter
 
 
@@ -312,11 +317,68 @@ def _save_body(self) -> None:
                 break
             self._save_chunk(start_i, end_i)
 
+    # tz-aware CSV formatting helper
+    @staticmethod
+    def _csv_format_datetime_tz_ea(ser: Series, na_rep: str) -> Series:
+        """
+        Consistent tz-aware formatting for ExtensionArray datetimes:
+        'YYYY-MM-DD HH:MM:SS.ffffff+HH:MM'
+        """
+        # +HHMM → +HH:MM
+        s = ser.dt.strftime("%Y-%m-%d %H:%M:%S.%f%z")
+        s = s.str.replace(r"([+-]\d{2})(\d{2})$", r"\1:\2", regex=True)
+        return s.fillna(na_rep)
+
+    # tz-aware CSV formatting helper
+    @staticmethod
+    def _csv_format_py_tz_aware_obj(ser: Series, na_rep: str) -> Series:
+        """
+        For object-dtype Series containing stdlib tz-aware datetimes, render
+        with microseconds and colonized offset. Leave other objects untouched.
+        """
+        if ser.empty:
+            return ser.astype(str)
+
+        vals = ser.to_numpy(object, copy=False)
+
+        def _is_tzaware_dt(x: object) -> bool:
+            if not isinstance(x, _pydatetime):
+                return False
+            tz = getattr(x, "tzinfo", None)
+            return tz is not None and tz.utcoffset(x) is not None
+
+        mask = np.fromiter(
+            (_is_tzaware_dt(x) for x in vals), dtype=bool, count=len(vals)
+        )
+        if mask.any():
+            out = vals.copy()
+            # isoformat gives 'YYYY-MM-DD HH:MM:SS.ffffff+HH:MM'
+            out[mask] = [
+                x.isoformat(sep=" ", timespec="microseconds") for x in out[mask]
+            ]
+            ser = ser._constructor(out, index=ser.index, name=ser.name)
+
+        return ser.fillna(na_rep)
+
     def _save_chunk(self, start_i: int, end_i: int) -> None:
         # create the data for a chunk
         slicer = slice(start_i, end_i)
         df = self.obj.iloc[slicer]
 
+        # If user didn't set date_format, normalize tz-aware datetimes to a
+        # single canonical string form for CSV (GH 62111).
+        if self.date_format is None:
+            # Address columns by position: with duplicate labels, label-based
+            # lookups such as ``df[col]`` return a DataFrame, not a Series.
+            for i, col_dtype in enumerate(df.dtypes):
+                if isinstance(col_dtype, DatetimeTZDtype):
+                    formatter = self._csv_format_datetime_tz_ea
+                elif is_object_dtype(col_dtype):
+                    formatter = self._csv_format_py_tz_aware_obj
+                else:
+                    continue
+                df.isetitem(i, formatter(df.iloc[:, i], self.na_rep))
+
         res = df._get_values_for_csv(**self._number_format)
         data = list(res._iter_column_arrays())
 
diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py
index 52f521d0d36eb..e2c8f55188154 100644
--- a/pandas/tests/io/formats/test_to_csv.py
+++ b/pandas/tests/io/formats/test_to_csv.py
@@ -1,3 +1,8 @@
+from datetime import (
+    datetime,
+    timedelta,
+    timezone,
+)
 import io
 import os
 import sys
@@ -712,6 +717,114 @@ def test_to_csv_encoding_binary_handle(self, mode):
             handle.seek(0)
             assert handle.read().startswith(b'\xef\xbb\xbf""')
 
+    # tz-aware timestamps with/without microseconds should be written
+    # consistently when date_format is not given:
+    # - the .ffffff part is emitted even when microseconds == 0
+    # - the UTC offset is rendered with a colon (+HH:MM)
+    #
+    # GH 62111
+
+    def test_to_csv_tz_aware_consistent_microseconds_formatting_python_datetime(self):
+        df = DataFrame(
+            {
+                "timestamp": [
+                    datetime(2025, 8, 14, 12, 34, 56, 0, tzinfo=timezone.utc),
+                    datetime(2025, 8, 14, 12, 34, 56, 1, tzinfo=timezone.utc),
+                ]
+            }
+        )
+        with tm.ensure_clean("test.csv") as path:
+            df.to_csv(path, index=False, lineterminator="\n")
+            with open(path, encoding="utf-8") as f:
+                contents = f.read()
+
+        expected = (
+            "timestamp\n"
+            "2025-08-14 12:34:56.000000+00:00\n"
+            "2025-08-14 12:34:56.000001+00:00\n"
+        )
+        assert contents == expected
+
+    def test_to_csv_tz_aware_consistent_microseconds_formatting_timestamp(self):
+        df = DataFrame(
+            {
+                "timestamp": [
+                    pd.Timestamp("2025-08-14 12:34:56+00:00"),
+                    pd.Timestamp("2025-08-14 12:34:56.000001+00:00"),
+                ]
+            }
+        )
+        with tm.ensure_clean("test.csv") as path:
+            df.to_csv(path, index=False, lineterminator="\n")
+            with open(path, encoding="utf-8") as f:
+                contents = f.read()
+
+        expected = (
+            "timestamp\n"
+            "2025-08-14 12:34:56.000000+00:00\n"
+            "2025-08-14 12:34:56.000001+00:00\n"
+        )
+        assert contents == expected
+
+    def test_to_csv_tz_aware_respects_date_format_python_datetime(self):
+        # No microseconds in date_format; %z produces +0000 (no colon) by design.
+        df = DataFrame(
+            {
+                "timestamp": [
+                    datetime(2025, 8, 14, 12, 34, 56, 0, tzinfo=timezone.utc),
+                    datetime(2025, 8, 14, 12, 34, 56, 1, tzinfo=timezone.utc),
+                ]
+            }
+        )
+        with tm.ensure_clean("test.csv") as path:
+            df.to_csv(
+                path,
+                index=False,
+                lineterminator="\n",
+                date_format="%Y-%m-%d %H:%M:%S%z",
+            )
+            with open(path, encoding="utf-8") as f:
+                contents = f.read()
+
+        expected = "timestamp\n2025-08-14 12:34:56+0000\n2025-08-14 12:34:56+0000\n"
+        assert contents == expected
+
+    def test_to_csv_tz_aware_consistent_microseconds_non_utc_offset_python_datetime(
+        self,
+    ):
+        am_tz = timezone(timedelta(hours=4))  # +04:00 (Armenia / Asia/Yerevan)
+        df = DataFrame(
+            {
+                "timestamp": [
+                    datetime(2025, 8, 14, 12, 34, 56, 0, tzinfo=am_tz),
+                    datetime(2025, 8, 14, 12, 34, 56, 1, tzinfo=am_tz),
+                ]
+            }
+        )
+        with tm.ensure_clean("test.csv") as path:
+            df.to_csv(path, index=False, lineterminator="\n")
+            with open(path, encoding="utf-8") as f:
+                contents = f.read()
+
+        expected = (
+            "timestamp\n"
+            "2025-08-14 12:34:56.000000+04:00\n"
+            "2025-08-14 12:34:56.000001+04:00\n"
+        )
+        assert contents == expected
+
+    def test_to_csv_tz_aware_duplicate_column_labels(self):
+        # Columns are processed by position, so repeated labels must work.
+        ts = pd.Timestamp("2025-08-14 12:34:56+00:00")
+        df = DataFrame([[ts, ts]], columns=["a", "a"])
+        result = df.to_csv(index=False, lineterminator="\n")
+
+        expected = (
+            "a,a\n"
+            "2025-08-14 12:34:56.000000+00:00,2025-08-14 12:34:56.000000+00:00\n"
+        )
+        assert result == expected
+
 
 def test_to_csv_iterative_compression_name(compression):
     # GH 38714