From 3c1ae7b0a566cc3feaa012b8f5264c3078ac82c1 Mon Sep 17 00:00:00 2001
From: Li
Date: Fri, 31 Oct 2025 04:18:43 +0000
Subject: [PATCH 1/5] add test for issue #50942

---
 pandas/tests/indexing/test_loc.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index de2d914aab229..1a57b89f6fe29 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -258,6 +258,25 @@ def test_loc_getitem_single_boolean_arg(self, obj, key, exp):
         else:
             assert res == exp

+    @pytest.mark.parametrize(
+        "obj",
+        [
+            DataFrame({"A": [datetime(2025, 10, 30)]}),
+            DataFrame({"A": [Timestamp(2025, 10, 30)] * 2}),
+            DataFrame({"A": [Timedelta(1)]}),
+            DataFrame({"A": [Timedelta(1), Timedelta(2)]}),
+        ],
+    )
+    def test_loc_empty_slice_assignment_with_datetime(self, obj):
+        # issue #50942
+        # empty slice assignment with datetime or timedelta should not raise exceptions
+        mask = [False] * len(obj)
+        try:
+            obj.loc[mask] = obj
+            assert True
+        except Exception:
+            pytest.fail("loc empty slice assignment raised Exception unexpectedly!")
+

 class TestLocBaseIndependent:
     # Tests for loc that do not depend on subclassing Base

From a44af1ad4893b27c0b417eff4fa0a6d6f57f0257 Mon Sep 17 00:00:00 2001
From: Li
Date: Tue, 4 Nov 2025 00:35:40 +0000
Subject: [PATCH 2/5] merge with main

---
 .pre-commit-config.yaml                       |  10 +-
 README.md                                     |   2 +-
 doc/source/user_guide/groupby.rst             |   2 +-
 doc/source/whatsnew/v3.0.0.rst                |   9 +
 pandas/_libs/tslibs/offsets.pyx               |  17 +-
 pandas/conftest.py                            |   2 +
 pandas/core/arrays/masked.py                  | 261 +++++++++++++++++-
 pandas/core/frame.py                          |  59 ++--
 pandas/core/indexes/base.py                   |   4 +-
 pandas/core/indexes/datetimes.py              |   6 +-
 pandas/core/interchange/from_dataframe.py     |  24 +-
 pandas/core/series.py                         |  18 +-
 pandas/tests/frame/methods/test_combine.py    |  16 ++
 .../tests/frame/methods/test_combine_first.py |  12 +
 pandas/tests/frame/methods/test_join.py       |  24 ++
 .../indexes/datetimes/methods/test_asof.py    |  16 ++
 .../indexes/datetimes/test_date_range.py      |  35 ++-
 pandas/tests/interchange/test_impl.py         | 158 +++++++----
 .../interchange/test_spec_conformance.py      |  31 ++-
 .../io/parser/common/test_file_buffer_url.py  |  62 ++---
 pandas/tests/resample/test_datetime_index.py  |  11 +
 .../series/methods/test_combine_first.py      |  18 +-
 .../tseries/frequencies/test_inference.py     |  14 +
 pandas/tseries/frequencies.py                 |   9 +
 web/pandas/config.yml                         |   1 +
 .../pdeps/0010-required-pyarrow-dependency.md |   7 +
 web/pandas/static/css/pandas.css              |  63 +++++
 27 files changed, 737 insertions(+), 154 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index dd3fe8b916a57..265f647d156ce 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,7 +19,7 @@ ci:
     skip: [pyright, mypy]
 repos:
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.13.3
+    rev: v0.14.3
     hooks:
     -   id: ruff
         args: [--exit-non-zero-on-fix]
@@ -46,7 +46,7 @@ repos:
     -   id: codespell
         types_or: [python, rst, markdown, cython, c]
 -   repo: https://github.com/MarcoGorelli/cython-lint
-    rev: v0.17.0
+    rev: v0.18.1
     hooks:
     -   id: cython-lint
     -   id: double-quote-cython-strings
@@ -67,11 +67,11 @@ repos:
     -   id: trailing-whitespace
         args: [--markdown-linebreak-ext=md]
 -   repo: https://github.com/PyCQA/isort
-    rev: 6.1.0
+    rev: 7.0.0
     hooks:
     -   id: isort
 -   repo: https://github.com/asottile/pyupgrade
-    rev: v3.20.0
+    rev: v3.21.0
     hooks:
     -   id: pyupgrade
         args: [--py311-plus]
@@ -87,7 +87,7 @@ repos:
         types: [text]  # overwrite types: [rst]
         types_or: [python, rst]
 -   repo: https://github.com/sphinx-contrib/sphinx-lint
-    rev: v1.0.0
+    rev: v1.0.1
     hooks:
     -   id: sphinx-lint
         args: ["--enable", "all", "--disable", "line-too-long"]
diff --git a/README.md b/README.md
index d15b36c151ff7..c6e0a4b319930 100644
--- a/README.md
+++ b/README.md
@@ -179,7 +179,7 @@ If you are simply looking to start working with the pandas codebase, navigate to

 You can also triage issues which may include reproducing bug reports, or asking for vital information such as version numbers or reproduction instructions. If you would like to start triaging issues, one easy way to get started is to [subscribe to pandas on CodeTriage](https://www.codetriage.com/pandas-dev/pandas).

-Or maybe through using pandas you have an idea of your own or are looking for something in the documentation and thinking ‘this can be improved’...you can do something about it!
+Or maybe through using pandas you have an idea of your own or are looking for something in the documentation and thinking ‘this can be improved’... you can do something about it!

 Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Slack](https://pandas.pydata.org/docs/dev/development/community.html?highlight=slack#community-slack).

diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
index 4ec34db6ed959..40369bd40cdb5 100644
--- a/doc/source/user_guide/groupby.rst
+++ b/doc/source/user_guide/groupby.rst
@@ -137,7 +137,7 @@ We could naturally group by either the ``A`` or ``B`` columns, or both:

 ``df.groupby('A')`` is just syntactic sugar for ``df.groupby(df['A'])``.

-The above GroupBy will split the DataFrame on its index (rows). To split by columns, first do
+DataFrame groupby always operates along axis 0 (rows). To split by columns, first do
 a transpose:

 .. ipython::
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 38755aef32b85..752d08a526d8c 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -737,7 +737,9 @@ Other Deprecations
 - Deprecated allowing ``fill_value`` that cannot be held in the original dtype (excepting NA values for integer and bool dtypes) in :meth:`Series.shift` and :meth:`DataFrame.shift` (:issue:`53802`)
 - Deprecated backward-compatibility behavior for :meth:`DataFrame.select_dtypes` matching "str" dtype when ``np.object_`` is specified (:issue:`61916`)
 - Deprecated option "future.no_silent_downcasting", as it is no longer used. In a future version accessing this option will raise (:issue:`59502`)
+- Deprecated silent casting of non-datetime 'other' to datetime in :meth:`Series.combine_first` (:issue:`62931`)
 - Deprecated slicing on a :class:`Series` or :class:`DataFrame` with a :class:`DatetimeIndex` using a ``datetime.date`` object, explicitly cast to :class:`Timestamp` instead (:issue:`35830`)
+- Deprecated support for the DataFrame Interchange Protocol (:issue:`56732`)
 - Deprecated the 'inplace' keyword from :meth:`Resampler.interpolate`, as passing ``True`` raises ``AttributeError`` (:issue:`58690`)

 .. ---------------------------------------------------------------------------
@@ -960,6 +962,7 @@ Categorical
 ^^^^^^^^^^^
 - Bug in :class:`Categorical` where constructing from a pandas :class:`Series` or :class:`Index` with ``dtype='object'`` did not preserve the categories' dtype as ``object``; now the ``categories.dtype`` is preserved as ``object`` for these cases, while numpy arrays and Python sequences with ``dtype='object'`` continue to infer the most specific dtype (for example, ``str`` if all elements are strings) (:issue:`61778`)
 - Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`)
+- Bug in :func:`bdate_range` raising ``ValueError`` with frequency ``freq="cbh"`` (:issue:`62849`)
 - Bug in :func:`testing.assert_index_equal` raising ``TypeError`` instead of ``AssertionError`` for incomparable ``CategoricalIndex`` when ``check_categorical=True`` and ``exact=False`` (:issue:`61935`)
 - Bug in :meth:`Categorical.astype` where ``copy=False`` would still trigger a copy of the codes (:issue:`62000`)
 - Bug in :meth:`DataFrame.pivot` and :meth:`DataFrame.set_index` raising an ``ArrowNotImplementedError`` for columns with pyarrow dictionary dtype (:issue:`53051`)
@@ -974,6 +977,7 @@ Datetimelike
 - Bug in :class:`Timestamp` constructor failing to raise when given a ``np.datetime64`` object with non-standard unit (:issue:`25611`)
 - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`)
 - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56147`)
+- Bug in :func:`infer_freq` with a :class:`Series` with :class:`ArrowDtype` timestamp dtype incorrectly raising ``TypeError`` (:issue:`58403`)
 - Bug in :func:`to_datetime` where passing an ``lxml.etree._ElementUnicodeResult`` together with ``format`` raised ``TypeError``. Now subclasses of ``str`` are handled. (:issue:`60933`)
 - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`)
 - Bug in :func:`tseries.frequencies.to_offset` would fail to parse frequency strings starting with "LWOM" (:issue:`59218`)
@@ -981,6 +985,7 @@ Datetimelike
 - Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` casting ``datetime64`` and ``timedelta64`` columns to ``float64`` and losing precision (:issue:`60850`)
 - Bug in :meth:`Dataframe.agg` with df with missing values resulting in IndexError (:issue:`58810`)
 - Bug in :meth:`DateOffset.rollback` (and subclass methods) with ``normalize=True`` rolling back one offset too long (:issue:`32616`)
+- Bug in :meth:`DatetimeIndex.asof` with a string key giving incorrect results (:issue:`50946`)
 - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` does not raise on Custom business days frequencies bigger then "1C" (:issue:`58664`)
 - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` returning ``False`` on double-digit frequencies (:issue:`58523`)
 - Bug in :meth:`DatetimeIndex.union` and :meth:`DatetimeIndex.intersection` when ``unit`` was non-nanosecond (:issue:`59036`)
@@ -1177,16 +1182,20 @@ Groupby/resample/rolling
 - Bug in :meth:`Rolling.apply` for ``method="table"`` where column order was not being respected due to the columns getting sorted by default. (:issue:`59666`)
 - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`)
 - Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time. (:issue:`58380`)
+- Bug in :meth:`Series.resample` raising an error when resampling non-nanosecond resolutions out of bounds for nanosecond precision (:issue:`57427`)

 Reshaping
 ^^^^^^^^^
 - Bug in :func:`concat` with mixed integer and bool dtypes incorrectly casting the bools to integers (:issue:`45101`)
 - Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`)
 - Bug in :meth:`DataFrame.combine_first` not preserving the column order (:issue:`60427`)
+- Bug in :meth:`DataFrame.combine_first` with non-unique columns incorrectly raising (:issue:`29135`)
+- Bug in :meth:`DataFrame.combine` with non-unique columns incorrectly raising (:issue:`51340`)
 - Bug in :meth:`DataFrame.explode` producing incorrect result for :class:`pyarrow.large_list` type (:issue:`61091`)
 - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`)
 - Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`)
 - Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`)
+- Bug in :meth:`Series.combine_first` incorrectly replacing ``None`` entries with ``NaN`` (:issue:`58977`)
 - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
 - Bug in :meth:`DataFrame.unstack` raising an error with indexes containing ``NaN`` with ``sort=False`` (:issue:`61221`)
 - Bug in :meth:`DataFrame.merge` when merging two :class:`DataFrame` on ``intc`` or ``uintc`` types on Windows (:issue:`60091`, :issue:`58713`)
diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx
index 8fb5e739d3a4e..be86118a2b9e2 100644
--- a/pandas/_libs/tslibs/offsets.pyx
+++ b/pandas/_libs/tslibs/offsets.pyx
@@ -5688,18 +5688,27 @@ def shift_month(stamp: datetime, months: int, day_opt: object = None) -> datetim
     cdef:
         int year, month, day
         int days_in_month, dy
+        npy_datetimestruct dts
+
+    if isinstance(stamp, _Timestamp):
+        creso = (<_Timestamp>stamp)._creso
+        val = (<_Timestamp>stamp)._value
+        pandas_datetime_to_datetimestruct(val, creso, &dts)
+    else:
+        # Plain datetime/date
+        pydate_to_dtstruct(stamp, &dts)

-    dy = (stamp.month + months) // 12
-    month = (stamp.month + months) % 12
+    dy = (dts.month + months) // 12
+    month = (dts.month + months) % 12

     if month == 0:
         month = 12
         dy -= 1

-    year = stamp.year + dy
+    year = dts.year + dy

     if day_opt is None:
         days_in_month = get_days_in_month(year, month)
-        day = min(stamp.day, days_in_month)
+        day = min(dts.day, days_in_month)
     elif day_opt == "start":
         day = 1
     elif day_opt == "end":
diff --git a/pandas/conftest.py b/pandas/conftest.py
index 82501cae4634d..7fe4ec7a5ee4f 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -135,12 +135,14 @@ def pytest_collection_modifyitems(items, config) -> None:
     # Warnings from doctests that can be ignored; place reason in comment above.
     # Each entry specifies (path, message) - see the ignore_doctest_warning function
     ignored_doctest_warnings = [
+        ("api.interchange.from_dataframe", ".*Interchange Protocol is deprecated"),
         ("is_int64_dtype", "is_int64_dtype is deprecated"),
         ("is_interval_dtype", "is_interval_dtype is deprecated"),
         ("is_period_dtype", "is_period_dtype is deprecated"),
         ("is_datetime64tz_dtype", "is_datetime64tz_dtype is deprecated"),
         ("is_categorical_dtype", "is_categorical_dtype is deprecated"),
         ("is_sparse", "is_sparse is deprecated"),
+        ("DataFrame.__dataframe__", "Interchange Protocol is deprecated"),
         ("DataFrameGroupBy.fillna", "DataFrameGroupBy.fillna is deprecated"),
         ("DataFrameGroupBy.corrwith", "DataFrameGroupBy.corrwith is deprecated"),
         ("NDFrame.replace", "Series.replace without 'value'"),
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index cdba53662e6fa..276ccbdc76fdd 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -25,7 +25,6 @@
     is_platform_windows,
 )
 from pandas.errors import AbstractMethodError
-from pandas.util._decorators import doc

 from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.cast import maybe_downcast_to_dtype
@@ -169,8 +168,15 @@ def _cast_pointwise_result(self, values) -> ArrayLike:
         return result

     @classmethod
-    @doc(ExtensionArray._empty)
     def _empty(cls, shape: Shape, dtype: ExtensionDtype) -> Self:
+        """
+        Create an ExtensionArray with the given shape and dtype.
+
+        See Also
+        --------
+        ExtensionDtype.empty
+            ExtensionDtype.empty is the 'official' public version of this API.
+        """
         dtype = cast(BaseMaskedDtype, dtype)
         values: np.ndarray = np.empty(shape, dtype=dtype.type)
         values.fill(dtype._internal_fill_value)
@@ -252,8 +258,44 @@ def _pad_or_backfill(
             new_values = self
         return new_values

-    @doc(ExtensionArray.fillna)
     def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self:
+        """
+        Fill NA/NaN values using the specified method.
+
+        Parameters
+        ----------
+        value : scalar, array-like
+            If a scalar value is passed it is used to fill all missing values.
+            Alternatively, an array-like "value" can be given. It's expected
+            that the array-like have the same length as 'self'.
+        limit : int, default None
+            The maximum number of entries where NA values will be filled.
+        copy : bool, default True
+            Whether to make a copy of the data before filling. If False, then
+            the original should be modified and no new memory should be allocated.
+            For ExtensionArray subclasses that cannot do this, it is at the
+            author's discretion whether to ignore "copy=False" or to raise.
+
+        Returns
+        -------
+        ExtensionArray
+            With NA/NaN filled.
+
+        See Also
+        --------
+        api.extensions.ExtensionArray.dropna : Return ExtensionArray without
+            NA values.
+        api.extensions.ExtensionArray.isna : A 1-D array indicating if
+            each value is missing.
+
+        Examples
+        --------
+        >>> arr = pd.array([np.nan, np.nan, 2, 3, np.nan, np.nan])
+        >>> arr.fillna(0)
+        <IntegerArray>
+        [0, 0, 2, 3, 0, 0]
+        Length: 6, dtype: Int64
+        """
         mask = self._mask
         if limit is not None and limit < len(self):
             modify = mask.cumsum() > limit
@@ -548,8 +590,30 @@ def to_numpy(
             data = self._data.astype(dtype, copy=copy)
         return data

-    @doc(ExtensionArray.tolist)
     def tolist(self) -> list:
+        """
+        Return a list of the values.
+
+        These are each a scalar type, which is a Python scalar
+        (for str, int, float) or a pandas scalar
+        (for Timestamp/Timedelta/Interval/Period)
+
+        Returns
+        -------
+        list
+            Python list of values in array.
+
+        See Also
+        --------
+        Index.to_list: Return a list of the values in the Index.
+        Series.to_list: Return a list of the values in the Series.
+
+        Examples
+        --------
+        >>> arr = pd.array([1, 2, 3])
+        >>> arr.tolist()
+        [1, 2, 3]
+        """
         if self.ndim > 1:
             return [x.tolist() for x in self]
         dtype = None if self._hasna else self._data.dtype
@@ -1075,10 +1139,37 @@ def _rank(

         return FloatingArray(result, mask=mask)

-    @doc(ExtensionArray.duplicated)
     def duplicated(
         self, keep: Literal["first", "last", False] = "first"
     ) -> npt.NDArray[np.bool_]:
+        """
+        Return boolean ndarray denoting duplicate values.
+
+        Parameters
+        ----------
+        keep : {'first', 'last', False}, default 'first'
+            - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
+            - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
+            - False : Mark all duplicates as ``True``.
+
+        Returns
+        -------
+        ndarray[bool]
+            With true in indices where elements are duplicated and false otherwise.
+
+        See Also
+        --------
+        DataFrame.duplicated : Return boolean Series denoting
+            duplicate rows.
+        Series.duplicated : Indicate duplicate Series values.
+        api.extensions.ExtensionArray.unique : Compute the ExtensionArray
+            of unique values.
+
+        Examples
+        --------
+        >>> pd.array([1, 1, 2, 3, 3], dtype="Int64").duplicated()
+        array([False,  True, False, False,  True])
+        """
         values = self._data
         mask = self._mask
         return algos.duplicated(values, keep=keep, mask=mask)
@@ -1094,13 +1185,56 @@ def unique(self) -> Self:
         uniques, mask = algos.unique_with_mask(self._data, self._mask)
         return self._simple_new(uniques, mask)

-    @doc(ExtensionArray.searchsorted)
     def searchsorted(
         self,
         value: NumpyValueArrayLike | ExtensionArray,
         side: Literal["left", "right"] = "left",
         sorter: NumpySorter | None = None,
     ) -> npt.NDArray[np.intp] | np.intp:
+        """
+        Find indices where elements should be inserted to maintain order.
+
+        Find the indices into a sorted array `self` (a) such that, if the
+        corresponding elements in `value` were inserted before the indices,
+        the order of `self` would be preserved.
+
+        Assuming that `self` is sorted:
+
+        ======  ================================
+        `side`  returned index `i` satisfies
+        ======  ================================
+        left    ``self[i-1] < value <= self[i]``
+        right   ``self[i-1] <= value < self[i]``
+        ======  ================================
+
+        Parameters
+        ----------
+        value : array-like, list or scalar
+            Value(s) to insert into `self`.
+        side : {'left', 'right'}, optional
+            If 'left', the index of the first suitable location found is given.
+            If 'right', return the last such index. If there is no suitable
+            index, return either 0 or N (where N is the length of `self`).
+        sorter : 1-D array-like, optional
+            Optional array of integer indices that sort array a into ascending
+            order. They are typically the result of argsort.
+
+        Returns
+        -------
+        array of ints or int
+            If value is array-like, array of insertion points.
+            If value is scalar, a single integer.
+
+        See Also
+        --------
+        numpy.searchsorted : Similar method from NumPy.
+
+        Examples
+        --------
+        >>> arr = pd.array([1, 2, 3, 5])
+        >>> arr.searchsorted([4])
+        array([3])
+        """
         if self._hasna:
             raise ValueError(
                 "searchsorted requires array to be sorted, which is impossible "
@@ -1111,11 +1245,56 @@ def searchsorted(
         # Base class searchsorted would cast to object, which is *much* slower.
         return self._data.searchsorted(value, side=side, sorter=sorter)

-    @doc(ExtensionArray.factorize)
     def factorize(
         self,
         use_na_sentinel: bool = True,
     ) -> tuple[np.ndarray, ExtensionArray]:
+        """
+        Encode the extension array as an enumerated type.
+
+        Parameters
+        ----------
+        use_na_sentinel : bool, default True
+            If True, the sentinel -1 will be used for NaN values. If False,
+            NaN values will be encoded as non-negative integers and will not drop the
+            NaN from the uniques of the values.
+
+            .. versionadded:: 1.5.0
+
+        Returns
+        -------
+        codes : ndarray
+            An integer NumPy array that's an indexer into the original
+            ExtensionArray.
+        uniques : ExtensionArray
+            An ExtensionArray containing the unique values of `self`.
+
+            .. note::
+
+                uniques will *not* contain an entry for the NA value of
+                the ExtensionArray if there are any missing values present
+                in `self`.
+
+        See Also
+        --------
+        factorize : Top-level factorize method that dispatches here.
+
+        Notes
+        -----
+        :meth:`pandas.factorize` offers a `sort` keyword as well.
+
+        Examples
+        --------
+        >>> idx1 = pd.PeriodIndex(
+        ...     ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"],
+        ...     freq="M",
+        ... )
+        >>> arr, idx = idx1.factorize()
+        >>> arr
+        array([0, 0, 1, 1, 2, 2])
+        >>> idx
+        PeriodIndex(['2014-01', '2014-02', '2014-03'], dtype='period[M]')
+        """
         arr = self._data
         mask = self._mask

@@ -1148,8 +1327,38 @@ def factorize(

         return codes, uniques_ea

-    @doc(ExtensionArray._values_for_argsort)
     def _values_for_argsort(self) -> np.ndarray:
+        """
+        Return values for sorting.
+
+        Returns
+        -------
+        ndarray
+            The transformed values should maintain the ordering between values
+            within the array.
+
+        See Also
+        --------
+        ExtensionArray.argsort : Return the indices that would sort this array.
+
+        Notes
+        -----
+        The caller is responsible for *not* modifying these values in-place, so
+        it is safe for implementers to give views on ``self``.
+
+        Functions that use this (e.g. ``ExtensionArray.argsort``) should ignore
+        entries with missing values in the original array (according to
+        ``self.isna()``). This means that the corresponding entries in the returned
+        array don't need to be modified to sort correctly.
+
+        Examples
+        --------
+        In most cases, this is the underlying Numpy array of the ``ExtensionArray``:
+
+        >>> arr = pd.array([1, 2, 3])
+        >>> arr._values_for_argsort()
+        array([1, 2, 3])
+        """
         return self._data

     def value_counts(self, dropna: bool = True) -> Series:
@@ -1198,8 +1407,42 @@ def _mode(self, dropna: bool = True) -> Self:
             result = type(self)(result, res_mask)
         return result[result.argsort()]

-    @doc(ExtensionArray.equals)
     def equals(self, other) -> bool:
+        """
+        Return if another array is equivalent to this array.
+
+        Equivalent means that both arrays have the same shape and dtype, and
+        all values compare equal. Missing values in the same location are
+        considered equal (in contrast with normal equality).
+
+        Parameters
+        ----------
+        other : ExtensionArray
+            Array to compare to this Array.
+
+        Returns
+        -------
+        boolean
+            Whether the arrays are equivalent.
+
+        See Also
+        --------
+        numpy.array_equal : Equivalent method for numpy array.
+        Series.equals : Equivalent method for Series.
+        DataFrame.equals : Equivalent method for DataFrame.
+
+        Examples
+        --------
+        >>> arr1 = pd.array([1, 2, np.nan])
+        >>> arr2 = pd.array([1, 2, np.nan])
+        >>> arr1.equals(arr2)
+        True
+
+        >>> arr1 = pd.array([1, 3, np.nan])
+        >>> arr2 = pd.array([1, 2, np.nan])
+        >>> arr1.equals(arr2)
+        False
+        """
         if type(self) != type(other):
             return False
         if other.dtype != self.dtype:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 68ea6795d47dd..a35ef122ed512 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -916,6 +916,14 @@ def __dataframe__(
         """
         Return the dataframe interchange object implementing the interchange protocol.

+        .. deprecated:: 3.0.0
+
+            The DataFrame Interchange Protocol is deprecated.
+            For dataframe-agnostic code, you may want to look into:
+
+            - `Arrow PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_
+            - `Narwhals <https://github.com/narwhals-dev/narwhals>`_
+
         .. note::

            For new development, we highly recommend using the Arrow C Data Interface
@@ -970,7 +978,14 @@ def __dataframe__(
         These methods (``column_names``, ``select_columns_by_name``) should work
         for any dataframe library which implements the interchange protocol.
         """
-
+        warnings.warn(
+            "The DataFrame Interchange Protocol is deprecated.\n"
+            "For dataframe-agnostic code, you may want to look into:\n"
+            "- Arrow PyCapsule Interface: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html\n"
+            "- Narwhals: https://github.com/narwhals-dev/narwhals\n",
+            Pandas4Warning,
+            stacklevel=find_stack_level(),
+        )
         from pandas.core.interchange.dataframe import PandasDataFrameXchg

         return PandasDataFrameXchg(self, allow_copy=allow_copy)
@@ -9038,16 +9053,6 @@ def combine(
         0  0 -5.0
         1  0  4.0

-        However, if the same element in both dataframes is None, that None
-        is preserved
-
-        >>> df1 = pd.DataFrame({"A": [0, 0], "B": [None, 4]})
-        >>> df2 = pd.DataFrame({"A": [1, 1], "B": [None, 3]})
-        >>> df1.combine(df2, take_smaller, fill_value=-5)
-           A    B
-        0  0 -5.0
-        1  0  3.0
-
         Example that demonstrates the use of `overwrite` and behavior when
         the axis differ between the dataframes.

@@ -9106,11 +9111,14 @@ def combine(
         # preserve column order
         new_columns = self.columns.union(other_columns, sort=False)

+        this = this.reindex(new_columns, axis=1)
+        other = other.reindex(new_columns, axis=1)
+
         do_fill = fill_value is not None
         result = {}
-        for col in new_columns:
-            series = this[col]
-            other_series = other[col]
+        for i in range(this.shape[1]):
+            series = this.iloc[:, i]
+            other_series = other.iloc[:, i]

             this_dtype = series.dtype
             other_dtype = other_series.dtype
@@ -9121,7 +9129,7 @@ def combine(
             # don't overwrite columns unnecessarily
             # DO propagate if this column is not in the intersection
             if not overwrite and other_mask.all():
-                result[col] = this[col].copy()
+                result[i] = series.copy()
                 continue

             if do_fill:
@@ -9130,7 +9138,7 @@ def combine(
                 series[this_mask] = fill_value
                 other_series[other_mask] = fill_value

-            if col not in self.columns:
+            if new_columns[i] not in self.columns:
                 # If self DataFrame does not have col in other DataFrame,
                 # try to promote series, which is all NaN, as other_dtype.
                 new_dtype = other_dtype
@@ -9155,10 +9163,10 @@ def combine(
                     arr, new_dtype
                 )

-            result[col] = arr
+            result[i] = arr

-        # convert_objects just in case
-        frame_result = self._constructor(result, index=new_index, columns=new_columns)
+        frame_result = self._constructor(result, index=new_index)
+        frame_result.columns = new_columns
         return frame_result.__finalize__(self, method="combine")

     def combine_first(self, other: DataFrame) -> DataFrame:
@@ -9222,9 +9230,14 @@ def combiner(x: Series, y: Series):
         combined = self.combine(other, combiner, overwrite=False)

         dtypes = {
+            # Check for isinstance(..., (np.dtype, ExtensionDtype))
+            # to prevent raising on non-unique columns see GH#29135.
+            # Note we will just not-cast in these cases.
             col: find_common_type([self.dtypes[col], other.dtypes[col]])
             for col in self.columns.intersection(other.columns)
-            if combined.dtypes[col] != self.dtypes[col]
+            if isinstance(combined.dtypes[col], (np.dtype, ExtensionDtype))
+            and isinstance(self.dtypes[col], (np.dtype, ExtensionDtype))
+            and combined.dtypes[col] != self.dtypes[col]
         }

         if dtypes:
@@ -9432,7 +9445,7 @@ def groupby(
             index. If a dict or Series is passed, the Series or dict VALUES
             will be used to determine the groups (the Series' values are first
             aligned; see ``.align()`` method). If a list or ndarray of length
-            equal to the selected axis is passed (see the `groupby user guide
+            equal to the number of rows is passed (see the `groupby user guide
             <https://pandas.pydata.org/docs/user_guide/groupby.html#splitting-an-object-into-groups>`_),
             the values are used as-is to determine the groups. A label or list
             of labels may be passed to group by the columns in ``self``.
@@ -13820,8 +13833,8 @@ def quantile(
         0.1  1    1
         0.5  3  100

-        Specifying `numeric_only=False` will also compute the quantile of
-        datetime and timedelta data.
+        Specifying `numeric_only=False` will compute the quantiles for all
+        columns.

         >>> df = pd.DataFrame(
         ...     {
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 6fafbd9590143..72f7a1e086b60 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -4168,7 +4168,7 @@ def reindex(
         limit : int, optional
             Maximum number of consecutive labels in ``target`` to match for
             inexact matches.
-        tolerance : int or float, optional
+        tolerance : int, float, or list-like, optional
             Maximum distance between original and new labels for inexact
             matches. The values of the index at the matching locations must
             satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
@@ -5675,7 +5675,7 @@ def asof(self, label):
             return self._na_value
         else:
             if isinstance(loc, slice):
-                loc = loc.indices(len(self))[-1]
+                return self[loc][-1]

         return self[loc]

diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index 6451e55f7fc4d..36bd5df8cf20b 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -1133,12 +1133,14 @@ def bdate_range(
         msg = "freq must be specified for bdate_range; use date_range instead"
         raise TypeError(msg)

-    if isinstance(freq, str) and freq.startswith("C"):
+    if isinstance(freq, str) and freq.upper().startswith("C"):
+        msg = f"invalid custom frequency string: {freq}"
+        if freq == "CBH":
+            raise ValueError(f"{msg}, did you mean cbh?")
         try:
             weekmask = weekmask or "Mon Tue Wed Thu Fri"
             freq = prefix_mapping[freq](holidays=holidays, weekmask=weekmask)
         except (KeyError, TypeError) as err:
-            msg = f"invalid custom frequency string: {freq}"
             raise ValueError(msg) from err
     elif holidays or weekmask:
         msg = (
diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py
index bcbeb546f845c..04278c0e7856d 100644
--- a/pandas/core/interchange/from_dataframe.py
+++ b/pandas/core/interchange/from_dataframe.py
@@ -6,13 +6,16 @@
     Any,
     overload,
 )
+import warnings

 import numpy as np

 from pandas._config import using_string_dtype

 from pandas.compat._optional import import_optional_dependency
+from pandas.errors import Pandas4Warning
 from pandas.util._decorators import set_module
+from pandas.util._exceptions import find_stack_level

 import pandas as pd
 from pandas.core.interchange.dataframe_protocol import (
@@ -47,6 +50,9 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame:
     From pandas 3.0 onwards, `from_dataframe` uses the PyCapsule Interface,
     only falling back to the interchange protocol if that fails.

+    From pandas 4.0 onwards, that fallback will no longer be available and only
+    the PyCapsule Interface will be used.
+
     .. warning::

        Due to severe implementation issues, we recommend only considering using the
@@ -99,7 +105,14 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame:
         pa = import_optional_dependency("pyarrow", min_version="14.0.0")
     except ImportError:
         # fallback to _from_dataframe
-        pass
+        warnings.warn(
+            "Conversion using Arrow PyCapsule Interface failed due to "
+            "missing PyArrow>=14 dependency, falling back to (deprecated) "
+            "interchange protocol. We recommend that you install "
+            "PyArrow>=14.0.0.",
+            UserWarning,
+            stacklevel=find_stack_level(),
+        )
     else:
         try:
             return pa.table(df).to_pandas(zero_copy_only=not allow_copy)
@@ -109,6 +122,15 @@ def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame:
     if not hasattr(df, "__dataframe__"):
         raise ValueError("`df` does not support __dataframe__")

+    warnings.warn(
+        "The DataFrame Interchange Protocol is deprecated.\n"
+        "For dataframe-agnostic code, you may want to look into:\n"
+        "- Arrow PyCapsule Interface: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html\n"
+        "- Narwhals: https://github.com/narwhals-dev/narwhals\n",
+        Pandas4Warning,
+        stacklevel=find_stack_level(),
+    )
+
     return _from_dataframe(
         df.__dataframe__(allow_copy=allow_copy), allow_copy=allow_copy
     )
diff --git a/pandas/core/series.py b/pandas/core/series.py
index fe71a3ab91933..1a8645cf1815d 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -87,7 +87,6 @@
 )
 from pandas.core.dtypes.dtypes import (
     ExtensionDtype,
-    SparseDtype,
 )
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
@@ -3112,8 +3111,8 @@ def combine(
         Combine the Series and `other` using `func` to perform elementwise
         selection for combined Series.

-        `fill_value` is assumed when value is missing at some index
-        from one of the two objects being combined.
+        `fill_value` is assumed when value is not present at some index
+        from one of the two Series being combined.

         Parameters
         ----------
@@ -3254,9 +3253,6 @@ def combine_first(self, other) -> Series:
         if self.dtype == other.dtype:
             if self.index.equals(other.index):
                 return self.mask(self.isna(), other)
-            elif self._can_hold_na and not isinstance(self.dtype, SparseDtype):
-                this, other = self.align(other, join="outer")
-                return this.mask(this.isna(), other)

         new_index = self.index.union(other.index)

@@ -3271,6 +3267,16 @@ def combine_first(self, other) -> Series:
         if this.dtype.kind == "M" and other.dtype.kind != "M":
             # TODO: try to match resos?
             other = to_datetime(other)
+            warnings.warn(
+                # GH#62931
+                "Silently casting non-datetime 'other' to datetime in "
+                "Series.combine_first is deprecated and will be removed "
+                "in a future version. Explicitly cast before calling "
+                "combine_first instead.",
+                Pandas4Warning,
+                stacklevel=find_stack_level(),
+            )
+
         combined = concat([this, other])
         combined = combined.reindex(new_index)
         return combined.__finalize__(self, method="combine_first")
diff --git a/pandas/tests/frame/methods/test_combine.py b/pandas/tests/frame/methods/test_combine.py
index bc6a67e4e1f32..f7631f3a2adda 100644
--- a/pandas/tests/frame/methods/test_combine.py
+++ b/pandas/tests/frame/methods/test_combine.py
@@ -45,3 +45,19 @@ def test_combine_generic(self, float_frame):
         )
         tm.assert_frame_equal(chunk, exp)
         tm.assert_frame_equal(chunk2, exp)
+
+    def test_combine_nonunique_columns(self):
+        # GH#51340
+
+        df = pd.DataFrame({"A": range(5), "B": range(5)})
+        df.columns = ["A", "A"]
+
+        other = df.copy()
+        df.iloc[1, :] = None
+
+        def combiner(a, b):
+            return b
+
+        result = df.combine(other, combiner)
+        expected = other.astype("float64")
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py
index e93684b4dc90f..e97bb2a98a390 100644
--- a/pandas/tests/frame/methods/test_combine_first.py
+++ b/pandas/tests/frame/methods/test_combine_first.py
@@ -413,6 +413,18 @@ def test_combine_first_preserve_EA_precision(self, wide_val, dtype):
         expected = DataFrame({"A": [wide_val, 5, wide_val]}, dtype=dtype)
         tm.assert_frame_equal(result, expected)

+    def test_combine_first_non_unique_columns(self):
+        # GH#29135
+        df1 = DataFrame([[1, np.nan], [3, 4]], columns=["P", "Q"], index=["A", "B"])
+        df2 = DataFrame(
+            [[5, 6, 7], [8, 9, np.nan]], columns=["P", "Q", "Q"], index=["A", "B"]
+        )
+        result = df1.combine_first(df2)
+        expected = DataFrame(
+            [[1, 6.0, 7.0], [3, 4.0, 4.0]], index=["A", "B"], columns=["P", "Q", "Q"]
+        )
+        tm.assert_frame_equal(result, expected)
+

 @pytest.mark.parametrize(
     "scalar1, scalar2",
diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py
index aaa9485cab580..fa2e2375966f2 100644
--- a/pandas/tests/frame/methods/test_join.py
+++ b/pandas/tests/frame/methods/test_join.py
@@ -575,3 +575,27 @@ def test_frame_join_tzaware(self):

         tm.assert_index_equal(result.index, expected)
         assert result.index.tz.key == "US/Central"
+
+    def test_frame_join_categorical_index(self):
+        # GH 61675
+        cat_data = pd.Categorical(
+            [3, 4],
+            categories=pd.Series([2, 3, 4, 5], dtype="Int64"),
+            ordered=True,
+        )
+        values1 = "a b".split()
+        values2 = "foo bar".split()
+        df1 = DataFrame({"hr": cat_data, "values1": values1}).set_index("hr")
+        df2 = DataFrame({"hr": cat_data, "values2": values2}).set_index("hr")
+        df1.columns = pd.CategoricalIndex([4], dtype=cat_data.dtype, name="other_hr")
+        df2.columns = pd.CategoricalIndex([3], dtype=cat_data.dtype, name="other_hr")
+
+        df_joined = df1.join(df2)
+        expected = DataFrame(
+            {"hr": cat_data, "values1": values1, "values2": values2}
+        ).set_index("hr")
+        expected.columns = pd.CategoricalIndex(
+            [4, 3], dtype=cat_data.dtype, name="other_hr"
+        )
+
+        tm.assert_frame_equal(df_joined, expected)
diff --git a/pandas/tests/indexes/datetimes/methods/test_asof.py b/pandas/tests/indexes/datetimes/methods/test_asof.py
index dc92f533087bc..41415f2a37337 100644
--- a/pandas/tests/indexes/datetimes/methods/test_asof.py
+++ b/pandas/tests/indexes/datetimes/methods/test_asof.py
@@ -1,6 +1,7 @@
 from datetime import timedelta

 from pandas import (
+    DatetimeIndex,
     Index,
     Timestamp,
     date_range,
@@ -28,3 +29,18 @@ def test_asof(self):

         dt = index[0].to_pydatetime()
         assert isinstance(index.asof(dt), Timestamp)
+
+    def test_asof_datetime_string(self):
+        # GH#50946
+
+        dti = date_range("2021-08-05", "2021-08-10", freq="1D")
+
+        key = "2021-08-09"
+        res = dti.asof(key)
+        exp = dti[4]
+        assert res == exp
+
+        # adding a non-midnight time caused a bug
+        dti2 = DatetimeIndex(list(dti) + ["2021-08-11 00:00:01"])
+        res = dti2.asof(key)
+        assert res == exp
diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py
index 45f43f9bf9760..b179f1b272fac 100644
--- a/pandas/tests/indexes/datetimes/test_date_range.py
+++ b/pandas/tests/indexes/datetimes/test_date_range.py
@@ -1216,7 +1216,7 @@ def test_cdaterange_holidays_weekmask_requires_freqstr(self):
         )

     @pytest.mark.parametrize(
-        "freq", [freq for freq in prefix_mapping if freq.startswith("C")]
+        "freq", [freq for freq in prefix_mapping if freq.upper().startswith("C")]
     )
     def test_all_custom_freq(self, freq):
         # should not raise
@@ -1280,6 +1280,39 @@ def test_data_range_custombusinessday_partial_time(self, unit):
         )
         tm.assert_index_equal(result, expected)

+    def test_cdaterange_cbh(self):
+        # GH#62849
+        result = bdate_range(
+            "2009-03-13",
+            "2009-03-15",
+            freq="cbh",
+            weekmask="Mon Wed Fri",
+            holidays=["2009-03-14"],
+        )
+        expected = DatetimeIndex(
+            [
+                "2009-03-13 09:00:00",
+                "2009-03-13 10:00:00",
+                "2009-03-13 11:00:00",
+                "2009-03-13 12:00:00",
+                "2009-03-13 13:00:00",
+                "2009-03-13 14:00:00",
+                "2009-03-13 15:00:00",
+                "2009-03-13 16:00:00",
+            ],
+            dtype="datetime64[ns]",
+            freq="cbh",
+        )
+        tm.assert_index_equal(result, expected)
+
+    def test_cdaterange_deprecated_error_CBH(self):
+        # GH#62849
+        msg = "invalid custom frequency string: CBH, did you mean cbh?"
+        with pytest.raises(ValueError, match=msg):
+            bdate_range(
+                START, END, freq="CBH", weekmask="Mon Wed Fri", holidays=["2009-03-14"]
+            )
+

 class TestDateRangeNonNano:
     def test_date_range_reso_validation(self):
diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py
index 5b7564e77d0ab..3551cbc52b755 100644
--- a/pandas/tests/interchange/test_impl.py
+++ b/pandas/tests/interchange/test_impl.py
@@ -31,7 +31,8 @@ def test_categorical_dtype(data):
     }
     df = pd.DataFrame({"A": (data_categorical[data[0]])})

-    col = df.__dataframe__().get_column_by_name("A")
+    with tm.assert_produces_warning(match="Interchange"):
+        col = df.__dataframe__().get_column_by_name("A")
     assert col.dtype[0] == DtypeKind.CATEGORICAL
     assert col.null_count == 0
     assert col.describe_null == (ColumnNullType.USE_SENTINEL, -1)
@@ -44,7 +45,8 @@ def test_categorical_dtype(data):
         desc_cat["categories"]._col, pd.Series(["a", "d", "e", "s", "t"])
     )

-    tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))
+    with tm.assert_produces_warning(match="Interchange"):
+        tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))


 def test_categorical_pyarrow():
@@ -54,7 +56,8 @@ def test_categorical_pyarrow():
     arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]
     table = pa.table({"weekday": pa.array(arr).dictionary_encode()})
     exchange_df = table.__dataframe__()
-    result = from_dataframe(exchange_df)
+    with tm.assert_produces_warning(match="Interchange"):
+        result = from_dataframe(exchange_df)
     weekday = pd.Categorical(
         arr, categories=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
     )
@@ -72,7 +75,8 @@ def test_empty_categorical_pyarrow():
     arr = [None]
     table = pa.table({"arr": pa.array(arr, "float64").dictionary_encode()})
     exchange_df = table.__dataframe__()
-    result = pd.api.interchange.from_dataframe(exchange_df)
+    with tm.assert_produces_warning(match="Interchange"):
+        result = pd.api.interchange.from_dataframe(exchange_df)
     expected = pd.DataFrame({"arr": pd.Categorical([np.nan])})
     tm.assert_frame_equal(result, expected)

@@ -84,12 +88,15 @@ def test_large_string_pyarrow():
     arr = ["Mon", "Tue"]
     table = pa.table({"weekday": pa.array(arr, "large_string")})
     exchange_df = table.__dataframe__()
-    result = from_dataframe(exchange_df)
+    with tm.assert_produces_warning(match="Interchange"):
+        result = from_dataframe(exchange_df)
     expected = pd.DataFrame({"weekday": ["Mon", "Tue"]})
     tm.assert_frame_equal(result, expected)

     # check round-trip
-    assert pa.Table.equals(pa.interchange.from_dataframe(result), table)
+    # Don't check stacklevel as PyArrow calls the deprecated `__dataframe__` method.
+    with tm.assert_produces_warning(match="Interchange", check_stacklevel=False):
+        assert pa.Table.equals(pa.interchange.from_dataframe(result), table)


 @pytest.mark.parametrize(
@@ -110,12 +117,15 @@ def test_bitmasks_pyarrow(offset, length, expected_values):
     arr = [3.3, None, 2.1]
     table = pa.table({"arr": arr}).slice(offset, length)
     exchange_df = table.__dataframe__()
-    result = from_dataframe(exchange_df)
+    with tm.assert_produces_warning(match="Interchange"):
+        result = from_dataframe(exchange_df)
     expected = pd.DataFrame({"arr": expected_values})
     tm.assert_frame_equal(result, expected)

     # check round-trip
-    assert pa.Table.equals(pa.interchange.from_dataframe(result), table)
+    # Don't check stacklevel as PyArrow calls the deprecated `__dataframe__` method.
+    with tm.assert_produces_warning(match="Interchange", check_stacklevel=False):
+        assert pa.Table.equals(pa.interchange.from_dataframe(result), table)


 @pytest.mark.parametrize(
@@ -140,7 +150,8 @@ def test_dataframe(data):
     }
     df = pd.DataFrame(data)

-    df2 = df.__dataframe__()
+    with tm.assert_produces_warning(match="Interchange"):
+        df2 = df.__dataframe__()

     assert df2.num_columns() == NCOLS
     assert df2.num_rows() == NROWS
@@ -150,8 +161,9 @@ def test_dataframe(data):
     indices = (0, 2)
     names = tuple(list(data.keys())[idx] for idx in indices)

-    result = from_dataframe(df2.select_columns(indices))
-    expected = from_dataframe(df2.select_columns_by_name(names))
+    with tm.assert_produces_warning(match="Interchange"):
+        result = from_dataframe(df2.select_columns(indices))
+        expected = from_dataframe(df2.select_columns_by_name(names))
     tm.assert_frame_equal(result, expected)

     assert isinstance(result.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"], list)
@@ -175,7 +187,8 @@ def test_missing_from_masked():
     ]
     df.loc[null_idx, col] = None

-    df2 = df.__dataframe__()
+    with tm.assert_produces_warning(match="Interchange"):
+        df2 = df.__dataframe__()
     assert df2.get_column_by_name("x").null_count == dict_null["x"]
     assert df2.get_column_by_name("y").null_count == dict_null["y"]

@@ -196,7 +209,8 @@ def test_missing_from_masked():
 )
 def test_mixed_data(data):
     df = pd.DataFrame(data)
-    df2 = df.__dataframe__()
+    with tm.assert_produces_warning(match="Interchange"):
+        df2 = df.__dataframe__()

     for col_name in df.columns:
         assert df2.get_column_by_name(col_name).null_count == 0
@@ -211,7 +225,8 @@ def test_mixed_missing():
         }
     )

-    df2 = df.__dataframe__()
+    with tm.assert_produces_warning(match="Interchange"):
+        df2 = df.__dataframe__()

     for col_name in df.columns:
         assert df2.get_column_by_name(col_name).null_count == 2
@@ -229,7 +244,8 @@ def test_string():
     }
     test_str_data = string_data["separator data"] + [""]
     df = pd.DataFrame({"A": test_str_data})
-    col = df.__dataframe__().get_column_by_name("A")
+    with tm.assert_produces_warning(match="Interchange"):
+        col = df.__dataframe__().get_column_by_name("A")

     assert col.size() == 6
     assert col.null_count == 1
@@ -237,7 +253,8 @@ def test_string():
     assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0)

     df_sliced = df[1:]
-    col = df_sliced.__dataframe__().get_column_by_name("A")
+    with tm.assert_produces_warning(match="Interchange"):
+        col = df_sliced.__dataframe__().get_column_by_name("A")
     assert col.size() == 5
     assert col.null_count == 1
     assert col.dtype[0] == DtypeKind.STRING
@@ -246,27 +263,31 @@ def test_string():

 def test_nonstring_object():
     df = pd.DataFrame({"A": ["a", 10, 1.0, ()]})
-    col = df.__dataframe__().get_column_by_name("A")
+    with tm.assert_produces_warning(match="Interchange"):
+        col = df.__dataframe__().get_column_by_name("A")
     with pytest.raises(NotImplementedError, match="not supported yet"):
         col.dtype


 def test_datetime():
     df = pd.DataFrame({"A": [pd.Timestamp("2022-01-01"), pd.NaT]})
-    col = df.__dataframe__().get_column_by_name("A")
+    with tm.assert_produces_warning(match="Interchange"):
+        col = df.__dataframe__().get_column_by_name("A")

     assert col.size() == 2
     assert col.null_count == 1
     assert col.dtype[0] == DtypeKind.DATETIME
     assert col.describe_null == (ColumnNullType.USE_SENTINEL, iNaT)

-    tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))
+    with tm.assert_produces_warning(match="Interchange"):
+        tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))


 def test_categorical_to_numpy_dlpack():
     # https://github.com/pandas-dev/pandas/issues/48393
     df = pd.DataFrame({"A": pd.Categorical(["a", "b", "a"])})
-    col = df.__dataframe__().get_column_by_name("A")
+    with tm.assert_produces_warning(match="Interchange"):
+        col = df.__dataframe__().get_column_by_name("A")
     result = np.from_dlpack(col.get_buffers()["data"][0])
     expected = np.array([0, 1, 0], dtype="int8")
     tm.assert_numpy_array_equal(result, expected)
@@ -275,11 +296,13 @@ def test_categorical_to_numpy_dlpack():
 @pytest.mark.parametrize("data", [{}, {"a": []}])
 def test_empty_pyarrow(data):
     # GH 53155
-    pytest.importorskip("pyarrow", "11.0.0")
+    pytest.importorskip("pyarrow", "14.0.0")
     from pyarrow.interchange import from_dataframe as pa_from_dataframe

     expected = pd.DataFrame(data)
-    arrow_df = pa_from_dataframe(expected)
+    # Don't check stacklevel as PyArrow calls the deprecated `__dataframe__` method.
+    with tm.assert_produces_warning(match="Interchange", check_stacklevel=False):
+        arrow_df = pa_from_dataframe(expected)

     result = from_dataframe(arrow_df)
     tm.assert_frame_equal(result, expected, check_column_type=False)
@@ -301,11 +324,15 @@ def test_multi_chunk_column() -> None:
     ser = pd.Series([1, 2, None], dtype="Int64[pyarrow]")
     df = pd.concat([ser, ser], ignore_index=True).to_frame("a")
     df_orig = df.copy()
-    with pytest.raises(
-        RuntimeError, match="Found multi-chunk pyarrow array, but `allow_copy` is False"
-    ):
-        pd.api.interchange.from_dataframe(df.__dataframe__(allow_copy=False))
-    result = pd.api.interchange.from_dataframe(df.__dataframe__(allow_copy=True))
+
+    with tm.assert_produces_warning(match="Interchange"):
+        with pytest.raises(
+            RuntimeError,
+            match="Found multi-chunk pyarrow array, but `allow_copy` is False",
+        ):
+            pd.api.interchange.from_dataframe(df.__dataframe__(allow_copy=False))
+    with tm.assert_produces_warning(match="Interchange"):
+        result = pd.api.interchange.from_dataframe(df.__dataframe__(allow_copy=True))
     # Interchange protocol defaults to creating numpy-backed columns, so currently this
     # is 'float64'.
     expected = pd.DataFrame({"a": [1.0, 2.0, None, 1.0, 2.0, None]}, dtype="float64")
@@ -334,8 +361,9 @@ def test_timestamp_ns_pyarrow():
         name="col0",
     ).to_frame()

-    dfi = df.__dataframe__()
-    result = pd.api.interchange.from_dataframe(dfi)["col0"].item()
+    with tm.assert_produces_warning(match="Interchange"):
+        dfi = df.__dataframe__()
+        result = pd.api.interchange.from_dataframe(dfi)["col0"].item()

     expected = pd.Timestamp(**timestamp_args)
     assert result == expected
@@ -348,7 +376,8 @@ def test_datetimetzdtype(tz, unit):
         pd.date_range("2018-01-01", periods=5, freq="D").tz_localize(tz).as_unit(unit)
     )
     df = pd.DataFrame({"ts_tz": tz_data})
-    tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))
+    with tm.assert_produces_warning(match="Interchange"):
+        tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))


 def test_interchange_from_non_pandas_tz_aware(request):
@@ -370,7 +399,8 @@ def test_interchange_from_non_pandas_tz_aware(request):
     arr = pc.assume_timezone(arr, "Asia/Kathmandu")
     table = pa.table({"arr": arr})
     exchange_df = table.__dataframe__()
-    result = from_dataframe(exchange_df)
+    with tm.assert_produces_warning(match="Interchange"):
+        result = from_dataframe(exchange_df)

     expected = pd.DataFrame(
         ["2020-01-01 00:00:00+05:45", "NaT", "2020-01-02 00:00:00+05:45"],
@@ -382,8 +412,9 @@ def test_interchange_from_non_pandas_tz_aware(request):

 def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None:
     # https://github.com/pandas-dev/pandas/issues/54781
-    df = pd.DataFrame({"a": ["foo", "bar"]}).__dataframe__()
-    interchange = df.__dataframe__()
+    with tm.assert_produces_warning(match="Interchange"):
+        df = pd.DataFrame({"a": ["foo", "bar"]}).__dataframe__()
+        interchange = df.__dataframe__()
     column = interchange.get_column_by_name("a")
     buffers = column.get_buffers()
     buffers_data = buffers["data"]
@@ -398,14 +429,16 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None:
     column.get_buffers = lambda: buffers
     interchange.get_column_by_name = lambda _: column
     monkeypatch.setattr(df, "__dataframe__", lambda allow_copy: interchange)
-    pd.api.interchange.from_dataframe(df)
+    with tm.assert_produces_warning(match="Interchange"):
+        pd.api.interchange.from_dataframe(df)


 def test_empty_string_column():
     # https://github.com/pandas-dev/pandas/issues/56703
     df = pd.DataFrame({"a": []}, dtype=str)
-    df2 = df.__dataframe__()
-    result = pd.api.interchange.from_dataframe(df2)
+    with tm.assert_produces_warning(match="Interchange"):
+        df2 = df.__dataframe__()
+        result = pd.api.interchange.from_dataframe(df2)
     tm.assert_frame_equal(df, result)


 def test_large_string():
     # GH#56702
     pytest.importorskip("pyarrow")
     df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]")
-    result = pd.api.interchange.from_dataframe(df.__dataframe__())
+    # Don't check stacklevel as PyArrow calls the deprecated `__dataframe__` method.
+    with tm.assert_produces_warning(match="Interchange", check_stacklevel=False):
+        result = pd.api.interchange.from_dataframe(df.__dataframe__())
     expected = pd.DataFrame({"a": ["x"]}, dtype="str")
     tm.assert_frame_equal(result, expected)


 def test_non_str_names():
     # https://github.com/pandas-dev/pandas/issues/56701
     df = pd.Series([1, 2, 3], name=0).to_frame()
-    names = df.__dataframe__().column_names()
+    with tm.assert_produces_warning(match="Interchange"):
+        names = df.__dataframe__().column_names()
     assert names == ["0"]


 def test_non_str_names_w_duplicates():
     # https://github.com/pandas-dev/pandas/issues/56701
     df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]})
-    dfi = df.__dataframe__()
-    with pytest.raises(
-        TypeError,
-        match=(
-            "Expected a Series, got a DataFrame. This likely happened because you "
-            "called __dataframe__ on a DataFrame which, after converting column "
-            r"names to string, resulted in duplicated names: Index\(\['0', '0'\], "
-            r"dtype='(str|object)'\). Please rename these columns before using the "
-            "interchange protocol."
-        ),
-    ):
-        pd.api.interchange.from_dataframe(dfi, allow_copy=False)
+    with tm.assert_produces_warning(match="Interchange"):
+        dfi = df.__dataframe__()
+    with tm.assert_produces_warning(match="Interchange"):
+        with pytest.raises(
+            TypeError,
+            match=(
+                "Expected a Series, got a DataFrame. This likely happened because you "
+                "called __dataframe__ on a DataFrame which, after converting column "
+                r"names to string, resulted in duplicated names: Index\(\['0', '0'\], "
+                r"dtype='(str|object)'\). Please rename these columns before using the "
+                "interchange protocol."
+            ),
+        ):
+            pd.api.interchange.from_dataframe(dfi, allow_copy=False)


 @pytest.mark.parametrize(
@@ -498,7 +536,8 @@ def test_pandas_nullable_with_missing_values(
         expected_dtype = pa.timestamp("us", "Asia/Kathmandu")

     df = pd.DataFrame({"a": data}, dtype=dtype)
-    result = pai.from_dataframe(df.__dataframe__())["a"]
+    with tm.assert_produces_warning(match="Interchange"):
+        result = pai.from_dataframe(df.__dataframe__())["a"]
     assert result.type == expected_dtype
     assert result[0].as_py() == data[0]
     assert result[1].as_py() == data[1]
@@ -564,7 +603,8 @@ def test_pandas_nullable_without_missing_values(
         expected_dtype = pa.timestamp("us", "Asia/Kathmandu")

     df = pd.DataFrame({"a": data}, dtype=dtype)
-    result = pai.from_dataframe(df.__dataframe__())["a"]
+    with tm.assert_produces_warning(match="Interchange"):
+        result = pai.from_dataframe(df.__dataframe__())["a"]
     assert result.type == expected_dtype
     assert result[0].as_py() == data[0]
     assert result[1].as_py() == data[1]
@@ -575,7 +615,8 @@ def test_string_validity_buffer() -> None:
     # https://github.com/pandas-dev/pandas/issues/57761
     pytest.importorskip("pyarrow", "11.0.0")
     df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]")
-    result = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"]
+    with tm.assert_produces_warning(match="Interchange"):
+        result = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"]
     assert result is None


@@ -583,7 +624,8 @@ def test_string_validity_buffer_no_missing() -> None:
     # https://github.com/pandas-dev/pandas/issues/57762
     pytest.importorskip("pyarrow", "11.0.0")
     df = pd.DataFrame({"a": ["x", None]}, dtype="large_string[pyarrow]")
-    validity = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"]
+    with tm.assert_produces_warning(match="Interchange"):
+        validity = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"]
     assert validity is not None
     result = validity[1]
     expected = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, "=")
@@ -593,8 +635,9 @@ def test_string_validity_buffer_no_missing() -> None:
 def test_empty_dataframe():
     # https://github.com/pandas-dev/pandas/issues/56700
     df = pd.DataFrame({"a": []}, dtype="int8")
-    dfi = df.__dataframe__()
-    result = pd.api.interchange.from_dataframe(dfi, allow_copy=False)
+    with tm.assert_produces_warning(match="Interchange"):
+        dfi = df.__dataframe__()
+        result = pd.api.interchange.from_dataframe(dfi, allow_copy=False)
     expected = pd.DataFrame({"a": []}, dtype="int8")
     tm.assert_frame_equal(result, expected)

@@ -639,7 +682,8 @@ def test_buffer_dtype_categorical(
 ) -> None:
     # https://github.com/pandas-dev/pandas/issues/54781
     df = pd.DataFrame({"data": data})
-    dfi = df.__dataframe__()
+    with tm.assert_produces_warning(match="Interchange"):
+        dfi = df.__dataframe__()
     col = dfi.get_column_by_name("data")
     assert col.dtype == expected_dtype
     assert col.get_buffers()["data"][1] == expected_buffer_dtype
diff --git a/pandas/tests/interchange/test_spec_conformance.py b/pandas/tests/interchange/test_spec_conformance.py
index 55e42ed2023cd..04e19b290f886 100644
--- a/pandas/tests/interchange/test_spec_conformance.py
+++ b/pandas/tests/interchange/test_spec_conformance.py
@@ -9,6 +9,7 @@
 import pytest

 import pandas as pd
+import pandas._testing as tm


 @pytest.fixture
@@ -32,7 +33,8 @@ def maker(dct, is_categorical=False):
 def test_only_one_dtype(test_data, df_from_dict):
     columns = list(test_data.keys())
     df = df_from_dict(test_data)
-    dfX = df.__dataframe__()
+    with tm.assert_produces_warning(match="Interchange"):
+        dfX = df.__dataframe__()

     column_size = len(test_data[columns[0]])
     for column in columns:
@@ -54,7 +56,8 @@ def test_mixed_dtypes(df_from_dict):
             "f": ["a", "", "c"],  # dtype kind STRING = 21
         }
     )
-    dfX = df.__dataframe__()
+    with tm.assert_produces_warning(match="Interchange"):
+        dfX = df.__dataframe__()
     # for meanings of dtype[0] see the spec; we cannot import the spec here as this
     # file is expected to be vendored *anywhere*;
     # values for dtype[0] are explained above
@@ -74,7 +77,8 @@ def test_mixed_dtypes(df_from_dict):

 def test_na_float(df_from_dict):
     df = df_from_dict({"a": [1.0, math.nan, 2.0]})
-    dfX = df.__dataframe__()
+    with tm.assert_produces_warning(match="Interchange"):
+        dfX = df.__dataframe__()
     colX = dfX.get_column_by_name("a")
     assert colX.null_count == 1
     assert isinstance(colX.null_count, int)
@@ -82,7 +86,8 @@ def test_na_float(df_from_dict):

 def test_noncategorical(df_from_dict):
     df = df_from_dict({"a": [1, 2, 3]})
-    dfX = df.__dataframe__()
+    with tm.assert_produces_warning(match="Interchange"):
+        dfX = df.__dataframe__()
     colX = dfX.get_column_by_name("a")
     with pytest.raises(TypeError, match=".*categorical.*"):
         colX.describe_categorical
@@ -94,7 +99,8 @@ def test_categorical(df_from_dict):
         is_categorical=True,
     )

-    colX = df.__dataframe__().get_column_by_name("weekday")
+    with tm.assert_produces_warning(match="Interchange"):
+        colX = df.__dataframe__().get_column_by_name("weekday")
     categorical = colX.describe_categorical
     assert isinstance(categorical["is_ordered"], bool)
     assert isinstance(categorical["is_dictionary"], bool)
@@ -104,7 +110,8 @@ def test_dataframe(df_from_dict):
     df = df_from_dict(
         {"x": [True, True, False], "y": [1, 2, 0], "z": [9.2, 10.5, 11.8]}
     )
-    dfX = df.__dataframe__()
+    with tm.assert_produces_warning(match="Interchange"):
+        dfX = df.__dataframe__()

     assert dfX.num_columns() == 3
     assert dfX.num_rows() == 3
@@ -118,7 +125,8 @@ def test_dataframe(df_from_dict):
 @pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)])
 def test_df_get_chunks(size, n_chunks, df_from_dict):
     df = df_from_dict({"x": list(range(size))})
-    dfX = df.__dataframe__()
+    with tm.assert_produces_warning(match="Interchange"):
+        dfX = df.__dataframe__()
     chunks = list(dfX.get_chunks(n_chunks))
     assert len(chunks) == n_chunks
     assert sum(chunk.num_rows() for chunk in chunks) == size
@@ -127,7 +135,8 @@ def test_df_get_chunks(size, n_chunks, df_from_dict):
 @pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)])
 def test_column_get_chunks(size, n_chunks, df_from_dict):
     df = df_from_dict({"x": list(range(size))})
-    dfX = df.__dataframe__()
+    with tm.assert_produces_warning(match="Interchange"):
+        dfX = df.__dataframe__()
     chunks = list(dfX.get_column(0).get_chunks(n_chunks))
     assert len(chunks) == n_chunks
     assert sum(chunk.size() for chunk in chunks) == size
@@ -135,7 +144,8 @@ def test_column_get_chunks(size, n_chunks, df_from_dict):

 def test_get_columns(df_from_dict):
     df = df_from_dict({"a": [0, 1], "b": [2.5, 3.5]})
-    dfX = df.__dataframe__()
+    with tm.assert_produces_warning(match="Interchange"):
+        dfX = df.__dataframe__()
     for colX in dfX.get_columns():
         assert colX.size() == 2
         assert colX.num_chunks() == 1
@@ -148,7 +158,8 @@ def test_get_columns(df_from_dict):
 def test_buffer(df_from_dict):
     arr = [0, 1, -1]
     df = df_from_dict({"a": arr})
-    dfX = df.__dataframe__()
+    with tm.assert_produces_warning(match="Interchange"):
+        dfX = df.__dataframe__()
     colX = dfX.get_column(0)
     bufX = colX.get_buffers()
a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index cef57318195ec..c88489fcdd229 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -97,25 +97,25 @@ def test_nonexistent_path(all_parsers): @pytest.mark.skipif(WASM, reason="limited file system access on WASM") @td.skip_if_windows # os.chmod does not work in windows -def test_no_permission(all_parsers): +def test_no_permission(all_parsers, temp_file): # GH 23784 parser = all_parsers msg = r"\[Errno 13\]" - with tm.ensure_clean() as path: - os.chmod(path, 0) # make file unreadable + path = temp_file + os.chmod(path, 0) # make file unreadable - # verify that this process cannot open the file (not running as sudo) - try: - with open(path, encoding="utf-8"): - pass - pytest.skip("Running as sudo.") - except PermissionError: + # verify that this process cannot open the file (not running as sudo) + try: + with open(path, encoding="utf-8"): pass + pytest.skip("Running as sudo.") + except PermissionError: + pass - with pytest.raises(PermissionError, match=msg) as e: - parser.read_csv(path) - assert path == e.value.filename + with pytest.raises(PermissionError, match=msg) as e: + parser.read_csv(path) + assert str(path.resolve()) == e.value.filename @pytest.mark.parametrize( @@ -269,19 +269,19 @@ def test_internal_eof_byte(all_parsers): tm.assert_frame_equal(result, expected) -def test_internal_eof_byte_to_file(all_parsers): +def test_internal_eof_byte_to_file(all_parsers, temp_file): # see gh-16559 parser = all_parsers data = b'c1,c2\r\n"test \x1a test", test\r\n' expected = DataFrame([["test \x1a test", " test"]], columns=["c1", "c2"]) path = f"__{uuid.uuid4()}__.csv" - with tm.ensure_clean(path) as path: - with open(path, "wb") as f: - f.write(data) + path2 = temp_file.parent / path + with open(path2, "wb") as f: + f.write(data) - result = parser.read_csv(path) - tm.assert_frame_equal(result, expected) + result = parser.read_csv(path2) + tm.assert_frame_equal(result, expected) def test_file_handle_string_io(all_parsers): @@ -372,7 +372,7 @@ def test_read_csv_file_handle(all_parsers, io_class, encoding): assert not handle.closed -def test_memory_map_compression(all_parsers, compression): +def test_memory_map_compression(all_parsers, compression, temp_file): """ Support memory map for compressed files. 
@@ -381,16 +381,16 @@ def test_memory_map_compression(all_parsers, compression): parser = all_parsers expected = DataFrame({"a": [1], "b": [2]}) - with tm.ensure_clean() as path: - expected.to_csv(path, index=False, compression=compression) + path = temp_file + expected.to_csv(path, index=False, compression=compression) - if parser.engine == "pyarrow": - msg = "The 'memory_map' option is not supported with the 'pyarrow' engine" - with pytest.raises(ValueError, match=msg): - parser.read_csv(path, memory_map=True, compression=compression) - return + if parser.engine == "pyarrow": + msg = "The 'memory_map' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(path, memory_map=True, compression=compression) + return - result = parser.read_csv(path, memory_map=True, compression=compression) + result = parser.read_csv(path, memory_map=True, compression=compression) tm.assert_frame_equal( result, @@ -442,12 +442,12 @@ def test_context_manageri_user_provided(all_parsers, datapath): @skip_pyarrow # ParserError: Empty CSV file -def test_file_descriptor_leak(all_parsers): +def test_file_descriptor_leak(all_parsers, temp_file): # GH 31488 parser = all_parsers - with tm.ensure_clean() as path: - with pytest.raises(EmptyDataError, match="No columns to parse from file"): - parser.read_csv(path) + path = temp_file + with pytest.raises(EmptyDataError, match="No columns to parse from file"): + parser.read_csv(path) def test_memory_map(all_parsers, csv_dir_path): diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index ab88d221864c0..3cd7f6c336956 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -2171,6 +2171,17 @@ def test_arrow_timestamp_resample_keep_index_name(): tm.assert_series_equal(result, expected) +def test_resample_unit_second_large_years(): + # GH#57427 + index = DatetimeIndex( + date_range(start=Timestamp("1950-01-01"), periods=10, freq="1000YS", unit="s") + ) + ser = Series(1, index=index) + result = ser.resample("2000YS").sum() + expected = Series(2, index=index[::2]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("freq", ["1A", "2A-MAR"]) def test_resample_A_raises(freq): msg = f"Invalid frequency: {freq[1:]}" diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index 51d6704e1905b..d39db924c6773 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -2,6 +2,8 @@ import numpy as np +from pandas.errors import Pandas4Warning + import pandas as pd from pandas import ( Period, @@ -75,9 +77,14 @@ def test_combine_first_dt64(self, unit): xp = to_datetime(Series(["2010", "2011"])).dt.as_unit(unit) tm.assert_series_equal(rs, xp) + def test_combine_first_dt64_casting_deprecation(self, unit): + # GH#62931 s0 = to_datetime(Series(["2010", np.nan])).dt.as_unit(unit) s1 = Series([np.nan, "2011"]) - rs = s0.combine_first(s1) + + msg = "Silently casting non-datetime 'other' to datetime" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + rs = s0.combine_first(s1) xp = Series([datetime(2010, 1, 1), "2011"], dtype=f"datetime64[{unit}]") @@ -144,3 +151,12 @@ def test_combine_mixed_timezone(self): ), ) tm.assert_series_equal(result, expected) + + def test_combine_first_none_not_nan(self): + # GH#58977 + s1 = Series([None, None, None], index=["a", "b", "c"]) + s2 = 
Series([None, None, None], index=["b", "c", "d"]) + + result = s1.combine_first(s2) + expected = Series([None] * 4, index=["a", "b", "c", "d"]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index dad5c73b89626..05e1d50a86d3c 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -13,6 +13,7 @@ from pandas._libs.tslibs.offsets import _get_offset from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG from pandas.compat import is_platform_windows +import pandas.util._test_decorators as td from pandas import ( DatetimeIndex, @@ -542,3 +543,16 @@ def test_infer_freq_non_nano_tzaware(tz_aware_fixture): res = frequencies.infer_freq(dta) assert res == "B" + + +@td.skip_if_no("pyarrow") +def test_infer_freq_pyarrow(): + # GH#58403 + data = ["2022-01-01T10:00:00", "2022-01-01T10:00:30", "2022-01-01T10:01:00"] + pd_series = Series(data).astype("timestamp[s][pyarrow]") + pd_index = Index(data).astype("timestamp[s][pyarrow]") + + assert frequencies.infer_freq(pd_index.values) == "30s" + assert frequencies.infer_freq(pd_series.values) == "30s" + assert frequencies.infer_freq(pd_index) == "30s" + assert frequencies.infer_freq(pd_series) == "30s" diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 4003221a06f6a..c4e6733b9a08d 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -37,6 +37,7 @@ from pandas.core.dtypes.common import is_numeric_dtype from pandas.core.dtypes.dtypes import ( + ArrowDtype, DatetimeTZDtype, PeriodDtype, ) @@ -132,6 +133,14 @@ def infer_freq( if isinstance(index, ABCSeries): values = index._values + + if isinstance(index.dtype, ArrowDtype): + import pyarrow as pa + + if pa.types.is_timestamp(values.dtype.pyarrow_dtype): + # GH#58403 + values = values._to_datetimearray() + if not ( lib.is_np_dtype(values.dtype, "mM") or isinstance(values.dtype, DatetimeTZDtype) diff --git a/web/pandas/config.yml b/web/pandas/config.yml index 4cb4ad0f48c5b..76e4494850182 100644 --- a/web/pandas/config.yml +++ b/web/pandas/config.yml @@ -19,6 +19,7 @@ main: - meta - footnotes - codehilite + - admonition static: logo: static/img/pandas_white.svg css: diff --git a/web/pandas/pdeps/0010-required-pyarrow-dependency.md b/web/pandas/pdeps/0010-required-pyarrow-dependency.md index 60ed8c4b910eb..29e6f7f0e98f3 100644 --- a/web/pandas/pdeps/0010-required-pyarrow-dependency.md +++ b/web/pandas/pdeps/0010-required-pyarrow-dependency.md @@ -10,6 +10,13 @@ [TOC] +!!! note + While this PDEP mentions adding pyarrow as a required dependency in + pandas 3.0, this aspect has been delayed until after pandas 3.0 (see the + abstract of [PDEP-14](https://pandas.pydata.org/pdeps/0014-string-dtype.html)). + Therefore, pandas 3.0 will *not* have a hard requirement on pyarrow but still use + pyarrow by default (for the new string dtype) when installed. 
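
Note on the frequencies.py hunk above: by routing pyarrow-backed timestamp
values through `_to_datetimearray()` before the dtype check, `infer_freq` now
accepts such data instead of raising. A minimal usage sketch of the behavior
the new test pins down (mirrors test_infer_freq_pyarrow; assumes pyarrow is
installed):

    import pandas as pd

    # Evenly spaced pyarrow-backed timestamps, 30 seconds apart (GH#58403)
    idx = pd.Index(
        ["2022-01-01T10:00:00", "2022-01-01T10:00:30", "2022-01-01T10:01:00"]
    ).astype("timestamp[s][pyarrow]")
    print(pd.infer_freq(idx))  # "30s"
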
diff --git a/web/pandas/config.yml b/web/pandas/config.yml
index 4cb4ad0f48c5b..76e4494850182 100644
--- a/web/pandas/config.yml
+++ b/web/pandas/config.yml
@@ -19,6 +19,7 @@ main:
     - meta
     - footnotes
     - codehilite
+    - admonition
   static:
     logo: static/img/pandas_white.svg
   css:
diff --git a/web/pandas/pdeps/0010-required-pyarrow-dependency.md b/web/pandas/pdeps/0010-required-pyarrow-dependency.md
index 60ed8c4b910eb..29e6f7f0e98f3 100644
--- a/web/pandas/pdeps/0010-required-pyarrow-dependency.md
+++ b/web/pandas/pdeps/0010-required-pyarrow-dependency.md
@@ -10,6 +10,13 @@

 [TOC]

+!!! note
+    While this PDEP mentions adding pyarrow as a required dependency in
+    pandas 3.0, this aspect has been delayed until after pandas 3.0 (see the
+    abstract of [PDEP-14](https://pandas.pydata.org/pdeps/0014-string-dtype.html)).
+    Therefore, pandas 3.0 will *not* have a hard requirement on pyarrow but still use
+    pyarrow by default (for the new string dtype) when installed.
+
 ## Abstract

 This PDEP proposes that:
diff --git a/web/pandas/static/css/pandas.css b/web/pandas/static/css/pandas.css
index 59904606040be..ef354d8264da4 100644
--- a/web/pandas/static/css/pandas.css
+++ b/web/pandas/static/css/pandas.css
@@ -135,3 +135,66 @@ h2:hover a.headerlink, h3:hover a.headerlink {
   opacity: 1;
   transition: opacity 0.5s;
 }
+
+
+/** Copied from the pydata-sphinx-theme **/
+ div.admonition, .admonition {
+     margin: 1.5625em auto;
+     padding: 0 0.6rem 0.8rem;
+     overflow: hidden;
+     box-shadow: 0 .2rem .5rem rgba(0,0,0,0.1),0 0 .0625rem rgba(0,0,0,0.1) !important;
+     /* break-inside has replaced page-break-inside and is widely usable since 2019 */
+     page-break-inside: avoid;
+     break-inside: avoid;
+     border-left: 0.2rem solid;
+     border-color: #276be9;
+     border-radius: 0.25rem;
+     background-color: #fff;
+     /** * Special-case for a `sidebar` class that makes the admonition float to * the right like the {
+    sidebar
+}
+    directive. */
+}
+ div.admonition *:last-child, .admonition *:last-child {
+     margin-bottom: 0;
+}
+ div.admonition p.admonition-title ~ *, .admonition p.admonition-title ~ * {
+     margin-left: 1.4rem;
+     margin-right: 1.4rem;
+}
+ div.admonition > ol, .admonition > ol, div.admonition > ul, .admonition > ul {
+     margin-left: 1em;
+}
+ div.admonition > .admonition-title, .admonition > .admonition-title {
+     margin: 0 -0.6rem;
+     padding: 0.4rem 0.6rem 0.4rem 2rem;
+     /* font-weight: var(--pst-admonition-font-weight-heading); */
+     position: relative;
+     background-color: #dce7fc;
+     z-index: 1;
+}
+ div.admonition > .admonition-title::after, .admonition > .admonition-title::after {
+     position: absolute;
+     left: 0.5rem;
+     width: 1rem;
+     height: 1rem;
+     /* color: var(--pst-color-info); */
+     /* font: var(--fa-font-solid); */
+     line-height: inherit;
+     /* content: "\f05a"; */
+     opacity: 1;
+}
+ div.admonition > .admonition-title + *, .admonition > .admonition-title + * {
+     margin-top: 0.4em;
+}
+
+ div.admonition.note, .admonition.note {
+     border-color: #276be9;
+}
+ div.admonition.note > .admonition-title, .admonition.note > .admonition-title {
+     background-color: #dce7fc;
+}
+ div.admonition.note > .admonition-title::after, .admonition.note > .admonition-title::after {
+     color: #276be9;
+     /* content: "\f05a"; */
+}

From c997161f4b2b11df5259e25dffaa4382d363bd66 Mon Sep 17 00:00:00 2001
From: Li
Date: Tue, 4 Nov 2025 02:21:39 +0000
Subject: [PATCH 3/5] address PR comment: construct DataFrame in the body of the test

---
 pandas/tests/indexing/test_loc.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index 1a57b89f6fe29..8a1eba809a2ec 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -259,20 +259,21 @@ def test_loc_getitem_single_boolean_arg(self, obj, key, exp):
             assert res == exp

     @pytest.mark.parametrize(
-        "obj",
+        "data",
         [
-            DataFrame({"A": [datetime(2025, 10, 30)]}),
-            DataFrame({"A": [Timestamp(2025, 10, 30)] * 2}),
-            DataFrame({"A": [Timedelta(1)]}),
-            DataFrame({"A": [Timedelta(1), Timedelta(2)]}),
+            [datetime(2025, 10, 30)],
+            [Timestamp(2025, 10, 30)] * 2,
+            [Timedelta(1)],
+            [Timedelta(1), Timedelta(2)],
         ],
     )
-    def test_loc_empty_slice_assignment_with_datetime(self, obj):
+    def test_loc_empty_slice_assignment_with_datetime(self, data):
         # issue #50942
         # empty slice assignment with datetime or timedelta should not raise exceptions
-        mask = [False] * len(obj)
+        mask = [False] * len(data)
         try:
-            obj.loc[mask] = obj
+            df = DataFrame(data=data, columns=["A"])
+            df.loc[mask] = df
             assert True
         except Exception:
             pytest.fail("loc empty slice assignment raised Exception unexpectedly!")

From 8d920f12a499dae2825d6c9e4a8eb481bb60ba27 Mon Sep 17 00:00:00 2001
From: Li
Date: Tue, 4 Nov 2025 02:34:52 +0000
Subject: [PATCH 4/5] address PR comment: add assert_frame_equal

---
 pandas/tests/indexing/test_loc.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index 8a1eba809a2ec..b2634f3a30fe3 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -273,8 +273,9 @@ def test_loc_empty_slice_assignment_with_datetime(self, data):
         mask = [False] * len(data)
         try:
             df = DataFrame(data=data, columns=["A"])
-            df.loc[mask] = df
-            assert True
+            res = df.loc[mask]
+            res = df
+            tm.assert_frame_equal(res, df)
         except Exception:
             pytest.fail("loc empty slice assignment raised Exception unexpectedly!")

From 4aedae4f8733a5ea439d69b1a60512c0ee0f8f71 Mon Sep 17 00:00:00 2001
From: Li
Date: Fri, 7 Nov 2025 04:48:43 +0000
Subject: [PATCH 5/5] address PR comment: assert df equals expected

---
 pandas/tests/indexing/test_loc.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index b2634f3a30fe3..8d59b0c026e0c 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -271,13 +271,11 @@ def test_loc_empty_slice_assignment_with_datetime(self, data):
         # issue #50942
         # empty slice assignment with datetime or timedelta should not raise exceptions
         mask = [False] * len(data)
-        try:
-            df = DataFrame(data=data, columns=["A"])
-            res = df.loc[mask]
-            res = df
-            tm.assert_frame_equal(res, df)
-        except Exception:
-            pytest.fail("loc empty slice assignment raised Exception unexpectedly!")
+
+        df = DataFrame(data=data, columns=["A"])
+        expected = df.copy()
+        df.loc[mask] = df
+        tm.assert_frame_equal(df, expected)


 class TestLocBaseIndependent:
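
Note on the final shape of the test: after patch 5/5 it asserts, via
assert_frame_equal, that an all-False boolean mask turns `.loc` assignment
into a silent no-op for datetime64 and timedelta64 columns rather than an
exception. A standalone sketch equivalent to the first parametrized case
(GH#50942):

    from datetime import datetime

    import pandas as pd
    import pandas._testing as tm

    df = pd.DataFrame({"A": [datetime(2025, 10, 30)]})
    expected = df.copy()
    # Empty selection: nothing is written and no error should be raised
    df.loc[[False] * len(df)] = df
    tm.assert_frame_equal(df, expected)
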