diff --git a/doc/source/reference/general_functions.rst b/doc/source/reference/general_functions.rst
index e93514de5f762..a76e51ace86d2 100644
--- a/doc/source/reference/general_functions.rst
+++ b/doc/source/reference/general_functions.rst
@@ -71,6 +71,7 @@ Top-level evaluation
 .. autosummary::
    :toctree: api/
 
+   col
    eval
 
 Datetime formats
diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst
index 89981786d60b5..919dafb291b86 100644
--- a/doc/source/user_guide/dsintro.rst
+++ b/doc/source/user_guide/dsintro.rst
@@ -553,6 +553,12 @@ a function of one argument to be evaluated on the DataFrame being assigned to.
 
    iris.assign(sepal_ratio=lambda x: (x["SepalWidth"] / x["SepalLength"])).head()
 
+or, using :func:`pandas.col`:
+
+.. ipython:: python
+
+   iris.assign(sepal_ratio=pd.col("SepalWidth") / pd.col("SepalLength")).head()
+
 :meth:`~pandas.DataFrame.assign` **always** returns a copy of the data, leaving the original
 DataFrame untouched.
 
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index b94d82f3c9783..372e93b216e26 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -117,10 +117,28 @@ process in more detail.
 
 `PDEP-7: Consistent copy/view semantics in pandas with Copy-on-Write <https://pandas.pydata.org/pdeps/0007-copy-on-write.html>`__
 
-.. _whatsnew_300.enhancements.enhancement2:
+.. _whatsnew_300.enhancements.col:
 
-Enhancement2
-^^^^^^^^^^^^
+``pd.col`` syntax can now be used in :meth:`DataFrame.assign` and :attr:`DataFrame.loc`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You can now use ``pd.col`` to create callables for use in dataframe methods which accept them. For example, if you have a dataframe
+
+.. ipython:: python
+
+   df = pd.DataFrame({'a': [1, 1, 2], 'b': [4, 5, 6]})
+
+and you want to create a new column ``'c'`` by summing ``'a'`` and ``'b'``, then instead of
+
+.. ipython:: python
+
+   df.assign(c=lambda df: df['a'] + df['b'])
+
+you can now write:
+
+.. ipython:: python
+
+   df.assign(c=pd.col('a') + pd.col('b'))
 
 New Deprecation Policy
 ^^^^^^^^^^^^^^^^^^^^^^
diff --git a/pandas/__init__.py b/pandas/__init__.py
index 8b92ad6cdfebb..cc786d1141c48 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -105,6 +105,7 @@
     Series,
     DataFrame,
 )
+from pandas.core.col import col
 
 from pandas.core.dtypes.dtypes import SparseDtype
 
@@ -281,6 +282,7 @@
     "array",
     "arrays",
     "bdate_range",
+    "col",
     "concat",
     "crosstab",
     "cut",
diff --git a/pandas/api/typing/__init__.py b/pandas/api/typing/__init__.py
index c1178c72f3edc..de6657b58ee80 100644
--- a/pandas/api/typing/__init__.py
+++ b/pandas/api/typing/__init__.py
@@ -6,6 +6,7 @@
 from pandas._libs.lib import NoDefault
 from pandas._libs.missing import NAType
 
+from pandas.core.col import Expression
 from pandas.core.groupby import (
     DataFrameGroupBy,
     SeriesGroupBy,
@@ -41,6 +42,7 @@
     "ExpandingGroupby",
     "ExponentialMovingWindow",
     "ExponentialMovingWindowGroupby",
+    "Expression",
     "FrozenList",
     "JsonReader",
     "NAType",
diff --git a/pandas/core/col.py b/pandas/core/col.py
new file mode 100644
index 0000000000000..eec1d241df92d
--- /dev/null
+++ b/pandas/core/col.py
@@ -0,0 +1,291 @@
+from __future__ import annotations
+
+from collections.abc import (
+    Callable,
+    Hashable,
+)
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
+
+from pandas.core.series import Series
+
+if TYPE_CHECKING:
+    from pandas import DataFrame
+
+
+# Used only for generating the str repr of expressions.
+_OP_SYMBOLS = {
+    "__add__": "+",
+    "__radd__": "+",
+    "__sub__": "-",
+    "__rsub__": "-",
+    "__mul__": "*",
+    "__rmul__": "*",
+    "__truediv__": "/",
+    "__rtruediv__": "/",
+    "__floordiv__": "//",
+    "__rfloordiv__": "//",
+    "__mod__": "%",
+    "__rmod__": "%",
+    "__ge__": ">=",
+    "__gt__": ">",
+    "__le__": "<=",
+    "__lt__": "<",
+    "__eq__": "==",
+    "__ne__": "!=",
+}
+
+
+def _parse_args(df: DataFrame, *args: Any) -> tuple[Any, ...]:
+    # Parse `args`, evaluating any expressions we encounter.
+    return tuple([x(df) if isinstance(x, Expression) else x for x in args])
+
+
+def _parse_kwargs(df: DataFrame, **kwargs: Any) -> dict[str, Any]:
+    # Parse `kwargs`, evaluating any expressions we encounter.
+    return {
+        key: val(df) if isinstance(val, Expression) else val
+        for key, val in kwargs.items()
+    }
+
+
+def _pretty_print_args_kwargs(*args: Any, **kwargs: Any) -> str:
+    # Build the "arg1, key=value" text shown inside the parentheses of a repr.
+    inputs_repr = ", ".join(
+        arg._repr_str if isinstance(arg, Expression) else repr(arg) for arg in args
+    )
+    kwargs_repr = ", ".join(
+        f"{k}={v._repr_str if isinstance(v, Expression) else v!r}"
+        for k, v in kwargs.items()
+    )
+
+    all_args = []
+    if inputs_repr:
+        all_args.append(inputs_repr)
+    if kwargs_repr:
+        all_args.append(kwargs_repr)
+
+    return ", ".join(all_args)
+
+
+class Expression:
+    """
+    Class representing a deferred column.
+
+    This is not meant to be instantiated directly. Instead, use :func:`pandas.col`.
+    """
+
+    def __init__(self, func: Callable[[DataFrame], Any], repr_str: str) -> None:
+        self._func = func
+        self._repr_str = repr_str
+
+    def __call__(self, df: DataFrame) -> Any:
+        return self._func(df)
+
+    def _with_binary_op(self, op: str, other: Any) -> Expression:
+        op_symbol = _OP_SYMBOLS.get(op, op)
+
+        if isinstance(other, Expression):
+            if op.startswith("__r"):
+                repr_str = f"({other._repr_str} {op_symbol} {self._repr_str})"
+            else:
+                repr_str = f"({self._repr_str} {op_symbol} {other._repr_str})"
+            return Expression(lambda df: getattr(self(df), op)(other(df)), repr_str)
+        else:
+            if op.startswith("__r"):
+                repr_str = f"({other!r} {op_symbol} {self._repr_str})"
+            else:
+                repr_str = f"({self._repr_str} {op_symbol} {other!r})"
+            return Expression(lambda df: getattr(self(df), op)(other), repr_str)
+
+    # Binary ops
+    def __add__(self, other: Any) -> Expression:
+        return self._with_binary_op("__add__", other)
+
+    def __radd__(self, other: Any) -> Expression:
+        return self._with_binary_op("__radd__", other)
+
+    def __sub__(self, other: Any) -> Expression:
+        return self._with_binary_op("__sub__", other)
+
+    def __rsub__(self, other: Any) -> Expression:
+        return self._with_binary_op("__rsub__", other)
+
+    def __mul__(self, other: Any) -> Expression:
+        return self._with_binary_op("__mul__", other)
+
+    def __rmul__(self, other: Any) -> Expression:
+        return self._with_binary_op("__rmul__", other)
+
+    def __truediv__(self, other: Any) -> Expression:
+        return self._with_binary_op("__truediv__", other)
+
+    def __rtruediv__(self, other: Any) -> Expression:
+        return self._with_binary_op("__rtruediv__", other)
+
+    def __floordiv__(self, other: Any) -> Expression:
+        return self._with_binary_op("__floordiv__", other)
+
+    def __rfloordiv__(self, other: Any) -> Expression:
+        return self._with_binary_op("__rfloordiv__", other)
+
+    def __ge__(self, other: Any) -> Expression:
+        return self._with_binary_op("__ge__", other)
+
+    def __gt__(self, other: Any) -> Expression:
+        return self._with_binary_op("__gt__", other)
+
+    def __le__(self, other: Any) -> Expression:
+        return self._with_binary_op("__le__", other)
+
+    def __lt__(self, other: Any) -> Expression:
+        return self._with_binary_op("__lt__", other)
+
+    def __eq__(self, other: object) -> Expression:  # type: ignore[override]
+        return self._with_binary_op("__eq__", other)
+
+    def __ne__(self, other: object) -> Expression:  # type: ignore[override]
+        return self._with_binary_op("__ne__", other)
+
+    def __mod__(self, other: Any) -> Expression:
+        return self._with_binary_op("__mod__", other)
+
+    def __rmod__(self, other: Any) -> Expression:
+        return self._with_binary_op("__rmod__", other)
+
+    # NumPy ufunc protocol: return a deferred call of `ufunc` on the evaluated inputs.
+    # NOTE(review): `method` is ignored, so e.g. ``np.add.reduce`` is not dispatched.
+    def __array_ufunc__(
+        self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any
+    ) -> Expression:
+        def func(df: DataFrame) -> Any:
+            parsed_inputs = _parse_args(df, *inputs)
+            parsed_kwargs = _parse_kwargs(df, **kwargs)
+            return ufunc(*parsed_inputs, **parsed_kwargs)
+
+        args_str = _pretty_print_args_kwargs(*inputs, **kwargs)
+        repr_str = f"{ufunc.__name__}({args_str})"
+
+        return Expression(func, repr_str)
+
+    # Everything else
+    def __getattr__(self, attr: str, /) -> Any:
+        if attr in Series._accessors:
+            return NamespaceExpression(self, attr)
+
+        def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any:
+            parsed_args = _parse_args(df, *args)
+            parsed_kwargs = _parse_kwargs(df, **kwargs)
+            return getattr(self(df), attr)(*parsed_args, **parsed_kwargs)
+
+        def wrapper(*args: Any, **kwargs: Any) -> Expression:
+            args_str = _pretty_print_args_kwargs(*args, **kwargs)
+            repr_str = f"{self._repr_str}.{attr}({args_str})"
+
+            return Expression(lambda df: func(df, *args, **kwargs), repr_str)
+
+        return wrapper
+
+    def __repr__(self) -> str:
+        return self._repr_str or "Expr(...)"
+
+
+class NamespaceExpression:
+    """
+    Deferred access to a Series accessor namespace, such as ``.str`` or ``.dt``.
+    """
+
+    def __init__(self, func: Expression, namespace: str) -> None:
+        self._func = func
+        self._namespace = namespace
+
+    def __call__(self, df: DataFrame) -> Any:
+        return self._func(df)
+
+    def __getattr__(self, attr: str) -> Any:
+        if isinstance(getattr(getattr(Series, self._namespace), attr), property):
+            repr_str = f"{self._func._repr_str}.{self._namespace}.{attr}"
+            return Expression(
+                lambda df: getattr(getattr(self(df), self._namespace), attr),
+                repr_str,
+            )
+
+        def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any:
+            parsed_args = _parse_args(df, *args)
+            parsed_kwargs = _parse_kwargs(df, **kwargs)
+            return getattr(getattr(self(df), self._namespace), attr)(
+                *parsed_args, **parsed_kwargs
+            )
+
+        def wrapper(*args: Any, **kwargs: Any) -> Expression:
+            args_str = _pretty_print_args_kwargs(*args, **kwargs)
+            repr_str = f"{self._func._repr_str}.{self._namespace}.{attr}({args_str})"
+            return Expression(lambda df: func(df, *args, **kwargs), repr_str)
+
+        return wrapper
+
+
+def col(col_name: Hashable) -> Expression:
+    """
+    Generate deferred object representing a column of a DataFrame.
+
+    Any place which accepts ``lambda df: df[col_name]``, such as
+    :meth:`DataFrame.assign` or :attr:`DataFrame.loc`, can also accept
+    ``pd.col(col_name)``.
+
+    Parameters
+    ----------
+    col_name : Hashable
+        Column name.
+
+    Returns
+    -------
+    `pandas.api.typing.Expression`
+        A deferred object representing a column of a DataFrame.
+
+    See Also
+    --------
+    DataFrame.query : Query columns of a dataframe using string expressions.
+
+    Examples
+    --------
+
+    You can use `col` in `assign`.
+
+    >>> df = pd.DataFrame({"name": ["beluga", "narwhal"], "speed": [100, 110]})
+    >>> df.assign(name_titlecase=pd.col("name").str.title())
+          name  speed name_titlecase
+    0   beluga    100         Beluga
+    1  narwhal    110        Narwhal
+
+    You can also use it for filtering.
+
+    >>> df.loc[pd.col("speed") > 105]
+          name  speed
+    1  narwhal    110
+    """
+    if not isinstance(col_name, Hashable):
+        msg = f"Expected Hashable, got: {type(col_name)}"
+        raise TypeError(msg)
+
+    def func(df: DataFrame) -> Series:
+        if col_name not in df.columns:
+            columns_str = str(df.columns.tolist())
+            # Truncate long column listings so the hint stays readable.
+            max_len = 90
+            if len(columns_str) > max_len:
+                columns_str = columns_str[:max_len] + "...]"
+
+            msg = (
+                f"Column '{col_name}' not found in given DataFrame.\n\n"
+                f"Hint: did you mean one of {columns_str} instead?"
+            )
+            raise ValueError(msg)
+        return df[col_name]
+
+    return Expression(func, f"col({col_name!r})")
+
+
+__all__ = ["Expression", "col"]
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index ec8c8116e5aee..b95dba1694ca0 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -5304,6 +5304,13 @@ def assign(self, **kwargs) -> DataFrame:
         Portland    17.0    62.6
         Berkeley    25.0    77.0
 
+        or by using :func:`pandas.col`:
+
+        >>> df.assign(temp_f=pd.col("temp_c") * 9 / 5 + 32)
+                  temp_c  temp_f
+        Portland    17.0    62.6
+        Berkeley    25.0    77.0
+
         You can create multiple columns within the same assign where one
         of the columns depends on another one defined within the same assign:
 
diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
index c2e77b69aadcb..2c26f77102df1 100644
--- a/pandas/tests/api/test_api.py
+++ b/pandas/tests/api/test_api.py
@@ -107,6 +107,7 @@ class TestPDApi(Base):
     funcs = [
         "array",
         "bdate_range",
+        "col",
         "concat",
         "crosstab",
         "cut",
@@ -260,6 +261,7 @@ class TestApi(Base):
         "ExpandingGroupby",
         "ExponentialMovingWindow",
         "ExponentialMovingWindowGroupby",
+        "Expression",
         "FrozenList",
         "JsonReader",
         "NaTType",
diff --git a/pandas/tests/test_col.py b/pandas/tests/test_col.py
new file mode 100644
index 0000000000000..c884540abfed0
--- /dev/null
+++ b/pandas/tests/test_col.py
@@ -0,0 +1,98 @@
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.api.typing import Expression
+from pandas.tests.test_register_accessor import ensure_removed
+
+
+@pytest.mark.parametrize(
+    ("expr", "expected_values", "expected_str"),
+    [
+        (pd.col("a"), [1, 2], "col('a')"),
+        (pd.col("a") * 2, [2, 4], "(col('a') * 2)"),
+        (pd.col("a").sum(), [3, 3], "col('a').sum()"),
+        (pd.col("a") + 1, [2, 3], "(col('a') + 1)"),
+        (1 + pd.col("a"), [2, 3], "(1 + col('a'))"),
+        (pd.col("a") - 1, [0, 1], "(col('a') - 1)"),
+        (1 - pd.col("a"), [0, -1], "(1 - col('a'))"),
+        (pd.col("a") * 1, [1, 2], "(col('a') * 1)"),
+        (1 * pd.col("a"), [1, 2], "(1 * col('a'))"),
+        (pd.col("a") / 1, [1.0, 2.0], "(col('a') / 1)"),
+        (1 / pd.col("a"), [1.0, 0.5], "(1 / col('a'))"),
+        (pd.col("a") // 1, [1, 2], "(col('a') // 1)"),
+        (1 // pd.col("a"), [1, 0], "(1 // col('a'))"),
+        (pd.col("a") % 1, [0, 0], "(col('a') % 1)"),
+        (1 % pd.col("a"), [0, 1], "(1 % col('a'))"),
+        (pd.col("a") > 1, [False, True], "(col('a') > 1)"),
+        (pd.col("a") >= 1, [True, True], "(col('a') >= 1)"),
+        (pd.col("a") < 1, [False, False], "(col('a') < 1)"),
+        (pd.col("a") <= 1, [True, False], "(col('a') <= 1)"),
+        (pd.col("a") == 1, [True, False], "(col('a') == 1)"),
+        (np.power(pd.col("a"), 2), [1, 4], "power(col('a'), 2)"),
+        (np.divide(pd.col("a"), pd.col("a")), [1.0, 1.0], "divide(col('a'), col('a'))"),
+    ],
+)
+def test_col_simple(
+    expr: Expression, expected_values: list[object], expected_str: str
+) -> None:
+    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+    result = df.assign(c=expr)
+    expected = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": expected_values})
+    tm.assert_frame_equal(result, expected)
+    assert str(expr) == expected_str
+
+
+@pytest.mark.parametrize(
+    ("expr", "expected_values", "expected_str"),
+    [
+        (pd.col("a").dt.year, [2020], "col('a').dt.year"),
+        (pd.col("a").dt.strftime("%B"), ["January"], "col('a').dt.strftime('%B')"),
+        (pd.col("b").str.upper(), ["FOO"], "col('b').str.upper()"),
+    ],
+)
+def test_namespaces(
+    expr: Expression, expected_values: list[object], expected_str: str
+) -> None:
+    df = pd.DataFrame({"a": [datetime(2020, 1, 1)], "b": ["foo"]})
+    result = df.assign(c=expr)
+    expected = pd.DataFrame(
+        {"a": [datetime(2020, 1, 1)], "b": ["foo"], "c": expected_values}
+    )
+    tm.assert_frame_equal(result, expected, check_dtype=False)
+    assert str(expr) == expected_str
+
+
+def test_invalid() -> None:
+    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+    with pytest.raises(ValueError, match=r"did you mean one of \['a', 'b'\] instead"):
+        df.assign(c=pd.col("c").mean())
+    df = pd.DataFrame({f"col_{i}": [0] for i in range(11)})
+    msg = (
+        "did you mean one of "
+        r"\['col_0', 'col_1', 'col_2', 'col_3', "
+        "'col_4', 'col_5', 'col_6', 'col_7', "
+        r"'col_8', 'col_9',\.\.\.\] instead"
+    )
+    with pytest.raises(ValueError, match=msg):
+        df.assign(c=pd.col("c").mean())
+
+
+def test_custom_accessor() -> None:
+    df = pd.DataFrame({"a": [1, 2, 3]})
+
+    class XYZAccessor:
+        def __init__(self, pandas_obj):
+            self._obj = pandas_obj
+
+        def mean(self):
+            return self._obj.mean()
+
+    with ensure_removed(pd.Series, "xyz"):
+        pd.api.extensions.register_series_accessor("xyz")(XYZAccessor)
+        result = df.assign(b=pd.col("a").xyz.mean())
+        expected = pd.DataFrame({"a": [1, 2, 3], "b": [2.0, 2.0, 2.0]})
+        tm.assert_frame_equal(result, expected)