diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 7240db129b246..cf96746b237ff 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -32,9 +32,6 @@ SpecificationError, ) from pandas.util._decorators import ( - Appender, - Substitution, - doc, set_module, ) from pandas.util._exceptions import find_stack_level @@ -71,7 +68,6 @@ from pandas.core.groupby.groupby import ( GroupBy, GroupByPlot, - _transform_template, ) from pandas.core.indexes.api import ( Index, @@ -409,7 +405,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) * ``'cython'`` : Runs the function through C-extensions from cython. * ``'numba'`` : Runs the function through JIT compiled code from numba. * ``None`` : Defaults to ``'cython'`` or globally setting - ``compute.use_numba`` + ``compute.use_numba``. engine_kwargs : dict, default None * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` @@ -665,27 +661,123 @@ def _wrap_applied_output( result.index = default_index(len(result)) return result.__finalize__(self.obj, method="groupby") - __examples_series_doc = dedent( - """ - >>> ser = pd.Series([390.0, 350.0, 30.0, 20.0], - ... index=["Falcon", "Falcon", "Parrot", "Parrot"], - ... name="Max Speed") - >>> grouped = ser.groupby([1, 1, 2, 2]) - >>> grouped.transform(lambda x: (x - x.mean()) / x.std()) - Falcon 0.707107 - Falcon -0.707107 - Parrot 0.707107 - Parrot -0.707107 - Name: Max Speed, dtype: float64 + def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + """ + Call function producing a same-indexed Series on each group. - Broadcast result of the transformation + Returns a Series having the same indexes as the original object + filled with the transformed values. 
- >>> grouped.transform(lambda x: x.max() - x.min()) - Falcon 40.0 - Falcon 40.0 - Parrot 10.0 - Parrot 10.0 - Name: Max Speed, dtype: float64 + Parameters + ---------- + func : function, str + Function to apply to each group. See the Notes section below for + requirements. + + Accepted inputs are: + + - String + - Python function + - Numba JIT function with ``engine='numba'`` specified. + + Only passing a single function is supported with this engine. + If the ``'numba'`` engine is chosen, the function must be + a user defined function with ``values`` and ``index`` as the + first and second arguments respectively in the function signature. + Each group's index will be passed to the user defined function + and optionally available for use. + + If a string is chosen, then it needs to be the name + of the groupby method you want to use. + *args + Positional arguments to pass to func. + engine : str, default None + * ``'cython'`` : Runs the function through C-extensions from cython. + * ``'numba'`` : Runs the function through JIT compiled code from numba. + * ``None`` : Defaults to ``cython`` or global setting ``compute.use_numba``. + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be + applied to the function. + + **kwargs + Keyword arguments to be passed into func. + + Returns + ------- + Series + Series with the same indexes as the original object filled + with transformed values. + + See Also + -------- + Series.groupby.apply : Apply function ``func`` group-wise and combine + the results together. + Series.groupby.aggregate : Aggregate using one or more operations. 
+ Series.transform : Call ``func`` on self producing a Series with the + same axis shape as self. + + Notes + ----- + Each group is endowed the attribute 'name' in case you need to know + which group you are working on. + + The current implementation imposes three requirements on f: + + * f must return a value that either has the same shape as the input + subframe or can be broadcast to the shape of the input subframe. + For example, if `f` returns a scalar it will be broadcast to have the + same shape as the input subframe. + * if this is a DataFrame, f must support application column-by-column + in the subframe. If f also supports application to the entire subframe, + then a fast path is used starting from the second chunk. + * f must not mutate groups. Mutation is not supported and may + produce unexpected results. See :ref:`gotchas.udf-mutation` for more details. + + When using ``engine='numba'``, there will be no "fall back" behavior internally. + The group data and group index will be passed as numpy arrays to the JITed + user defined function, and no alternative execution attempts will be tried. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. + + .. versionchanged:: 2.0.0 + + When using ``.transform`` on a grouped DataFrame and the transformation + function returns a DataFrame, pandas now aligns the result's index + with the input's index. You can call ``.to_numpy()`` on the + result of the transformation function to avoid alignment. + + Examples + -------- + + >>> ser = pd.Series( + ... [390.0, 350.0, 30.0, 20.0], + ... index=["Falcon", "Falcon", "Parrot", "Parrot"], + ... name="Max Speed", + ... 
) + >>> grouped = ser.groupby([1, 1, 2, 2]) + >>> grouped.transform(lambda x: (x - x.mean()) / x.std()) + Falcon 0.707107 + Falcon -0.707107 + Parrot 0.707107 + Parrot -0.707107 + Name: Max Speed, dtype: float64 + + Broadcast result of the transformation: + + >>> grouped.transform(lambda x: x.max() - x.min()) + Falcon 40.0 + Falcon 40.0 + Parrot 10.0 + Parrot 10.0 + Name: Max Speed, dtype: float64 >>> grouped.transform("mean") Falcon 370.0 @@ -694,21 +786,16 @@ def _wrap_applied_output( Parrot 25.0 Name: Max Speed, dtype: float64 - The resulting dtype will reflect the return value of the passed ``func``, - for example: - - >>> grouped.transform(lambda x: x.astype(int).max()) - Falcon 390 - Falcon 390 - Parrot 30 - Parrot 30 - Name: Max Speed, dtype: int64 - """ - ) - - @Substitution(klass="Series", example=__examples_series_doc) - @Appender(_transform_template) - def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + The resulting dtype will reflect the return value of the passed ``func``, + for example: + + >>> grouped.transform(lambda x: x.astype(int).max()) + Falcon 390 + Falcon 390 + Parrot 30 + Parrot 30 + Name: Max Speed, dtype: int64 + """ return self._transform( func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs ) @@ -903,8 +990,246 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: result.index = default_index(len(result)) return result - @doc(Series.describe) def describe(self, percentiles=None, include=None, exclude=None) -> Series: + """ + Generate descriptive statistics. + + Descriptive statistics include those that summarize the central + tendency, dispersion and shape of a + dataset's distribution, excluding ``NaN`` values. + + Analyzes both numeric and object series, as well + as ``DataFrame`` column sets of mixed data types. The output + will vary depending on what is provided. Refer to the notes + below for more detail. 
+ + Parameters + ---------- + percentiles : list-like of numbers, optional + The percentiles to include in the output. All should + fall between 0 and 1. The default, ``None``, will automatically + return the 25th, 50th, and 75th percentiles. + include : 'all', list-like of dtypes or None (default), optional + A white list of data types to include in the result. Ignored + for ``Series``. Here are the options: + + - 'all' : All columns of the input will be included in the output. + - A list-like of dtypes : Limits the results to the + provided data types. + To limit the result to numeric types submit + ``numpy.number``. To limit it instead to object columns submit + the ``numpy.object`` data type. Strings + can also be used in the style of + ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To + select pandas categorical columns, use ``'category'`` + - None (default) : The result will include all numeric columns. + exclude : list-like of dtypes or None (default), optional, + A black list of data types to omit from the result. Ignored + for ``Series``. Here are the options: + + - A list-like of dtypes : Excludes the provided data types + from the result. To exclude numeric types submit + ``numpy.number``. To exclude object columns submit the data + type ``numpy.object``. Strings can also be used in the style of + ``select_dtypes`` (e.g. ``df.describe(exclude=['O'])``). To + exclude pandas categorical columns, use ``'category'`` + - None (default) : The result will exclude nothing. + + Returns + ------- + Series or DataFrame + Summary statistics of the Series or Dataframe provided. + + See Also + -------- + DataFrame.count: Count number of non-NA/null observations. + DataFrame.max: Maximum of the values in the object. + DataFrame.min: Minimum of the values in the object. + DataFrame.mean: Mean of the values. + DataFrame.std: Standard deviation of the observations. 
+ DataFrame.select_dtypes: Subset of a DataFrame including/excluding + columns based on their dtype. + + Notes + ----- + For numeric data, the result's index will include ``count``, + ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and + upper percentiles. By default the lower percentile is ``25`` and the + upper percentile is ``75``. The ``50`` percentile is the + same as the median. + + For object data (e.g. strings), the result's index + will include ``count``, ``unique``, ``top``, and ``freq``. The ``top`` + is the most common value. The ``freq`` is the most common value's + frequency. + + If multiple object values have the highest count, then the + ``count`` and ``top`` results will be arbitrarily chosen from + among those with the highest count. + + For mixed data types provided via a ``DataFrame``, the default is to + return only an analysis of numeric columns. If the DataFrame consists + only of object and categorical data without any numeric columns, the + default is to return an analysis of both the object and categorical + columns. If ``include='all'`` is provided as an option, the result + will include a union of attributes of each type. + + The `include` and `exclude` parameters can be used to limit + which columns in a ``DataFrame`` are analyzed for the output. + The parameters are ignored when analyzing a ``Series``. + + Examples + -------- + Describing a numeric ``Series``. + + >>> s = pd.Series([1, 2, 3]) + >>> s.describe() + count 3.0 + mean 2.0 + std 1.0 + min 1.0 + 25% 1.5 + 50% 2.0 + 75% 2.5 + max 3.0 + dtype: float64 + + Describing a categorical ``Series``. + + >>> s = pd.Series(["a", "a", "b", "c"]) + >>> s.describe() + count 4 + unique 3 + top a + freq 2 + dtype: object + + Describing a timestamp ``Series``. + + >>> s = pd.Series( + ... [ + ... np.datetime64("2000-01-01"), + ... np.datetime64("2010-01-01"), + ... np.datetime64("2010-01-01"), + ... ] + ... 
) + >>> s.describe() + count 3 + mean 2006-09-01 08:00:00 + min 2000-01-01 00:00:00 + 25% 2004-12-31 12:00:00 + 50% 2010-01-01 00:00:00 + 75% 2010-01-01 00:00:00 + max 2010-01-01 00:00:00 + dtype: object + + Describing a ``DataFrame``. By default only numeric fields + are returned. + + >>> df = pd.DataFrame( + ... { + ... "categorical": pd.Categorical(["d", "e", "f"]), + ... "numeric": [1, 2, 3], + ... "object": ["a", "b", "c"], + ... } + ... ) + >>> df.describe() + numeric + count 3.0 + mean 2.0 + std 1.0 + min 1.0 + 25% 1.5 + 50% 2.0 + 75% 2.5 + max 3.0 + + Describing all columns of a ``DataFrame`` regardless of data type. + + >>> df.describe(include="all") # doctest: +SKIP + categorical numeric object + count 3 3.0 3 + unique 3 NaN 3 + top f NaN a + freq 1 NaN 1 + mean NaN 2.0 NaN + std NaN 1.0 NaN + min NaN 1.0 NaN + 25% NaN 1.5 NaN + 50% NaN 2.0 NaN + 75% NaN 2.5 NaN + max NaN 3.0 NaN + + Describing a column from a ``DataFrame`` by accessing it as + an attribute. + + >>> df.numeric.describe() + count 3.0 + mean 2.0 + std 1.0 + min 1.0 + 25% 1.5 + 50% 2.0 + 75% 2.5 + max 3.0 + Name: numeric, dtype: float64 + + Including only numeric columns in a ``DataFrame`` description. + + >>> df.describe(include=[np.number]) + numeric + count 3.0 + mean 2.0 + std 1.0 + min 1.0 + 25% 1.5 + 50% 2.0 + 75% 2.5 + max 3.0 + + Including only string columns in a ``DataFrame`` description. + + >>> df.describe(include=[object]) # doctest: +SKIP + object + count 3 + unique 3 + top a + freq 1 + + Including only categorical columns from a ``DataFrame`` description. + + >>> df.describe(include=["category"]) + categorical + count 3 + unique 3 + top d + freq 1 + + Excluding numeric columns from a ``DataFrame`` description. + + >>> df.describe(exclude=[np.number]) # doctest: +SKIP + categorical object + count 3 3 + unique 3 3 + top f a + freq 1 1 + + Excluding object columns from a ``DataFrame`` description. 
+ + >>> df.describe(exclude=[object]) # doctest: +SKIP + categorical numeric + count 3 3.0 + unique 3 NaN + top f NaN + freq 1 NaN + mean NaN 2.0 + std NaN 1.0 + min NaN 1.0 + 25% NaN 1.5 + 50% NaN 2.0 + 75% NaN 2.5 + max NaN 3.0 + """ return super().describe( percentiles=percentiles, include=include, exclude=exclude ) @@ -1391,15 +1716,332 @@ def alt(obj): ) @property - @doc(Series.plot.__doc__) def plot(self) -> GroupByPlot: + """ + Make plots of Series or DataFrame. + + Uses the backend specified by the + option ``plotting.backend``. By default, matplotlib is used. + + Parameters + ---------- + data : Series or DataFrame + The object for which the method is called. + + Attributes + ---------- + x : label or position, default None + Only used if data is a DataFrame. + y : label, position or list of label, positions, default None + Allows plotting of one column versus another. Only used if data is a + DataFrame. + kind : str + The kind of plot to produce: + + - 'line' : line plot (default) + - 'bar' : vertical bar plot + - 'barh' : horizontal bar plot + - 'hist' : histogram + - 'box' : boxplot + - 'kde' : Kernel Density Estimation plot + - 'density' : same as 'kde' + - 'area' : area plot + - 'pie' : pie plot + - 'scatter' : scatter plot (DataFrame only) + - 'hexbin' : hexbin plot (DataFrame only) + ax : matplotlib axes object, default None + An axes of the current figure. + subplots : bool or sequence of iterables, default False + Whether to group columns into subplots: + + - ``False`` : No subplots will be used + - ``True`` : Make separate subplots for each column. + - sequence of iterables of column labels: Create a subplot for each + group of columns. For example `[('a', 'c'), ('b', 'd')]` will + create 2 subplots: one with columns 'a' and 'c', and one + with columns 'b' and 'd'. Remaining columns that aren't specified + will be plotted in additional subplots (one per column). + + .. 
versionadded:: 1.5.0 + + sharex : bool, default True if ax is None else False + In case ``subplots=True``, share x axis and set some x axis + labels to invisible; + defaults to True if ax is None otherwise False if + an ax is passed in; Be aware, that passing in both an ax and + ``sharex=True`` will alter all x axis labels for all axis in a figure. + sharey : bool, default False + In case ``subplots=True``, share y axis and set some y axis + labels to invisible. + layout : tuple, optional + (rows, columns) for the layout of subplots. + figsize : a tuple (width, height) in inches + Size of a figure object. + use_index : bool, default True + Use index as ticks for x axis. + title : str or list + Title to use for the plot. If a string is passed, print the string + at the top of the figure. If a list is passed and `subplots` is + True, print each item in the list above the corresponding subplot. + grid : bool, default None (matlab style default) + Axis grid lines. + legend : bool or {'reverse'} + Place legend on axis subplots. + style : list or dict + The matplotlib line style per column. + logx : bool or 'sym', default False + Use log scaling or symlog scaling on x axis. + + logy : bool or 'sym' default False + Use log scaling or symlog scaling on y axis. + + loglog : bool or 'sym', default False + Use log scaling or symlog scaling on both x and y axes. + + xticks : sequence + Values to use for the xticks. + yticks : sequence + Values to use for the yticks. + xlim : 2-tuple/list + Set the x limits of the current axes. + ylim : 2-tuple/list + Set the y limits of the current axes. + xlabel : label, optional + Name to use for the xlabel on x-axis. Default uses index name as xlabel, + or the x-column name for planar plots. + + .. versionchanged:: 2.0.0 + + Now applicable to histograms. + + ylabel : label, optional + Name to use for the ylabel on y-axis. Default will show no ylabel, or the + y-column name for planar plots. + + .. 
versionchanged:: 2.0.0 + + Now applicable to histograms. + + rot : float, default None + Rotation for ticks (xticks for vertical, yticks for horizontal + plots). + fontsize : float, default None + Font size for xticks and yticks. + colormap : str or matplotlib colormap object, default None + Colormap to select colors from. If string, load colormap with that + name from matplotlib. + colorbar : bool, optional + If True, plot colorbar (only relevant for 'scatter' and 'hexbin' + plots). + position : float + Specify relative alignments for bar plot layout. + From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 + (center). + table : bool, Series or DataFrame, default False + If True, draw a table using the data in the DataFrame and the data + will be transposed to meet matplotlib's default layout. + If a Series or DataFrame is passed, use passed data to draw a + table. + yerr : DataFrame, Series, array-like, dict and str + See :ref:`Plotting with Error Bars <visualization.errorbars>` for + detail. + xerr : DataFrame, Series, array-like, dict and str + Equivalent to yerr. + stacked : bool, default False in line and bar plots, and True in area plot + If True, create stacked plot. + secondary_y : bool or sequence, default False + Whether to plot on the secondary y-axis if a list/tuple, which + columns to plot on secondary y-axis. + mark_right : bool, default True + When using a secondary_y axis, automatically mark the column + labels with "(right)" in the legend. + include_bool : bool, default is False + If True, boolean values can be plotted. + backend : str, default None + Backend to use instead of the backend specified in the option + ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to + specify the ``plotting.backend`` for the whole session, set + ``pd.options.plotting.backend``. + **kwargs + Options to pass to matplotlib plotting method.
+ + Returns + ------- + :class:`matplotlib.axes.Axes` or numpy.ndarray of them + If the backend is not the default matplotlib one, the return value + will be the object returned by the backend. + + See Also + -------- + matplotlib.pyplot.plot : Plot y versus x as lines and/or markers. + DataFrame.hist : Make a histogram. + DataFrame.boxplot : Make a box plot. + DataFrame.plot.scatter : Make a scatter plot with varying marker + point size and color. + DataFrame.plot.hexbin : Make a hexagonal binning plot of + two variables. + DataFrame.plot.kde : Make Kernel Density Estimate plot using + Gaussian kernels. + DataFrame.plot.area : Make a stacked area plot. + DataFrame.plot.bar : Make a bar plot. + DataFrame.plot.barh : Make a horizontal bar plot. + + Notes + ----- + - See matplotlib documentation online for more on this subject + - If `kind` = 'bar' or 'barh', you can specify relative alignments + for bar plot layout by `position` keyword. + From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 + (center) + + Examples + -------- + For Series: + + .. plot:: + :context: close-figs + + >>> ser = pd.Series([1, 2, 3, 3]) + >>> plot = ser.plot(kind="hist", title="My plot") + + For DataFrame: + + .. plot:: + :context: close-figs + >>> df = pd.DataFrame( + ... { + ... "length": [1.5, 0.5, 1.2, 0.9, 3], + ... "width": [0.7, 0.2, 0.15, 0.2, 1.1], + ... }, + ... index=["pig", "rabbit", "duck", "chicken", "horse"], + ... ) + >>> plot = df.plot(title="DataFrame Plot") + + For SeriesGroupBy: + + .. plot:: + :context: close-figs + >>> lst = [-1, -2, -3, 1, 2, 3] + >>> ser = pd.Series([1, 2, 2, 4, 6, 6], index=lst) + >>> plot = ser.groupby(lambda x: x > 0).plot(title="SeriesGroupBy Plot") + + For DataFrameGroupBy: + + .. plot:: + :context: close-figs + >>> df = pd.DataFrame({"col1": [1, 2, 3, 4], "col2": ["A", "B", "A", "B"]}) + >>> plot = df.groupby("col2").plot( + ... kind="bar", title="DataFrameGroupBy Plot" + ... 
) + """ result = GroupByPlot(self) return result - @doc(Series.nlargest.__doc__) def nlargest( self, n: int = 5, keep: Literal["first", "last", "all"] = "first" ) -> Series: + """ + Return the largest `n` elements. + + Parameters + ---------- + n : int, default 5 + Return this many descending sorted values. + keep : {'first', 'last', 'all'}, default 'first' + When there are duplicate values that cannot all fit in a + Series of `n` elements: + + - ``first`` : return the first `n` occurrences in order + of appearance. + - ``last`` : return the last `n` occurrences in reverse + order of appearance. + - ``all`` : keep all occurrences. This can result in a Series of + size larger than `n`. + + Returns + ------- + Series + The `n` largest values in the Series, sorted in decreasing order. + + See Also + -------- + Series.nsmallest: Get the `n` smallest elements. + Series.sort_values: Sort Series by values. + Series.head: Return the first `n` rows. + + Notes + ----- + Faster than ``.sort_values(ascending=False).head(n)`` for small `n` + relative to the size of the ``Series`` object. + + Examples + -------- + >>> countries_population = { + ... "Italy": 59000000, + ... "France": 65000000, + ... "Malta": 434000, + ... "Maldives": 434000, + ... "Brunei": 434000, + ... "Iceland": 337000, + ... "Nauru": 11300, + ... "Tuvalu": 11300, + ... "Anguilla": 11300, + ... "Montserrat": 5200, + ... } + >>> s = pd.Series(countries_population) + >>> s + Italy 59000000 + France 65000000 + Malta 434000 + Maldives 434000 + Brunei 434000 + Iceland 337000 + Nauru 11300 + Tuvalu 11300 + Anguilla 11300 + Montserrat 5200 + dtype: int64 + + The `n` largest elements where ``n=5`` by default. + + >>> s.nlargest() + France 65000000 + Italy 59000000 + Malta 434000 + Maldives 434000 + Brunei 434000 + dtype: int64 + + The `n` largest elements where ``n=3``. Default `keep` value is 'first' + so Malta will be kept. 
+ + >>> s.nlargest(3) + France 65000000 + Italy 59000000 + Malta 434000 + dtype: int64 + + The `n` largest elements where ``n=3`` and keeping the last duplicates. + Brunei will be kept since it is the last with value 434000 based on + the index order. + + >>> s.nlargest(3, keep="last") + France 65000000 + Italy 59000000 + Brunei 434000 + dtype: int64 + + The `n` largest elements where ``n=3`` with all duplicates kept. Note + that the returned Series has five elements due to the three duplicates. + + >>> s.nlargest(3, keep="all") + France 65000000 + Italy 59000000 + Malta 434000 + Maldives 434000 + Brunei 434000 + dtype: int64 + """ f = partial(Series.nlargest, n=n, keep=keep) data = self._obj_with_exclusions # Don't change behavior if result index happens to be the same, i.e. @@ -1407,10 +2049,110 @@ def nlargest( result = self._python_apply_general(f, data, not_indexed_same=True) return result - @doc(Series.nsmallest.__doc__) def nsmallest( self, n: int = 5, keep: Literal["first", "last", "all"] = "first" ) -> Series: + """ + Return the smallest `n` elements. + + Parameters + ---------- + n : int, default 5 + Return this many ascending sorted values. + keep : {'first', 'last', 'all'}, default 'first' + When there are duplicate values that cannot all fit in a + Series of `n` elements: + + - ``first`` : return the first `n` occurrences in order + of appearance. + - ``last`` : return the last `n` occurrences in reverse + order of appearance. + - ``all`` : keep all occurrences. This can result in a Series of + size larger than `n`. + + Returns + ------- + Series + The `n` smallest values in the Series, sorted in increasing order. + + See Also + -------- + Series.nlargest: Get the `n` largest elements. + Series.sort_values: Sort Series by values. + Series.head: Return the first `n` rows. + + Notes + ----- + Faster than ``.sort_values().head(n)`` for small `n` relative to + the size of the ``Series`` object. + + Examples + -------- + >>> countries_population = { + ... 
"Italy": 59000000, + ... "France": 65000000, + ... "Brunei": 434000, + ... "Malta": 434000, + ... "Maldives": 434000, + ... "Iceland": 337000, + ... "Nauru": 11300, + ... "Tuvalu": 11300, + ... "Anguilla": 11300, + ... "Montserrat": 5200, + ... } + >>> s = pd.Series(countries_population) + >>> s + Italy 59000000 + France 65000000 + Brunei 434000 + Malta 434000 + Maldives 434000 + Iceland 337000 + Nauru 11300 + Tuvalu 11300 + Anguilla 11300 + Montserrat 5200 + dtype: int64 + + The `n` smallest elements where ``n=5`` by default. + + >>> s.nsmallest() + Montserrat 5200 + Nauru 11300 + Tuvalu 11300 + Anguilla 11300 + Iceland 337000 + dtype: int64 + + The `n` smallest elements where ``n=3``. Default `keep` value is + 'first' so Nauru and Tuvalu will be kept. + + >>> s.nsmallest(3) + Montserrat 5200 + Nauru 11300 + Tuvalu 11300 + dtype: int64 + + The `n` smallest elements where ``n=3`` and keeping the last + duplicates. Anguilla and Tuvalu will be kept since they are the last + with value 11300 based on the index order. + + >>> s.nsmallest(3, keep="last") + Montserrat 5200 + Anguilla 11300 + Tuvalu 11300 + dtype: int64 + + The `n` smallest elements where ``n=3`` with all duplicates kept. Note + that the returned Series has four elements due to the three duplicates. + + >>> s.nsmallest(3, keep="all") + Montserrat 5200 + Nauru 11300 + Tuvalu 11300 + Anguilla 11300 + dtype: int64 + """ f = partial(Series.nsmallest, n=n, keep=keep) data = self._obj_with_exclusions # Don't change behavior if result index happens to be the same, i.e. @@ -1540,22 +2282,126 @@ def idxmax(self, skipna: bool = True) -> Series: """ return self._idxmax_idxmin("idxmax", skipna=skipna) - @doc(Series.corr.__doc__) def corr( self, other: Series, method: CorrelationMethod = "pearson", min_periods: int | None = None, ) -> Series: + """ + Compute correlation with `other` Series, excluding missing values. 
+ + The two `Series` objects are not required to be the same length and will be + aligned internally before the correlation function is applied. + + Parameters + ---------- + other : Series + Series with which to compute the correlation. + method : {'pearson', 'kendall', 'spearman'} or callable + Method used to compute correlation: + + - pearson : Standard correlation coefficient + - kendall : Kendall Tau correlation coefficient + - spearman : Spearman rank correlation + - callable: Callable with input two 1d ndarrays and returning a float. + + .. warning:: + Note that the returned matrix from corr will have 1 along the + diagonals and will be symmetric regardless of the callable's + behavior. + min_periods : int, optional + Minimum number of observations needed to have a valid result. + + Returns + ------- + float + Correlation with other. + + See Also + -------- + DataFrame.corr : Compute pairwise correlation between columns. + DataFrame.corrwith : Compute pairwise correlation with another + DataFrame or Series. + + Notes + ----- + Pearson, Kendall and Spearman correlation are currently computed using + pairwise complete observations. + + * `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_ + * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_ + * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_ + + Automatic data alignment: as with all pandas operations, automatic data + alignment is performed for this method. + ``corr()`` automatically considers values with matching indices. + + Examples + -------- + >>> def histogram_intersection(a, b): + ... v = np.minimum(a, b).sum().round(decimals=1) + ... return v + >>> s1 = pd.Series([0.2, 0.0, 0.6, 0.2]) + >>> s2 = pd.Series([0.3, 0.6, 0.0, 0.1]) + >>> s1.corr(s2, method=histogram_intersection) + 0.3 + + Pandas auto-aligns the values with matching indices.
+ + >>> s1 = pd.Series([1, 2, 3], index=[0, 1, 2]) + >>> s2 = pd.Series([1, 2, 3], index=[2, 1, 0]) + >>> s1.corr(s2) + -1.0 + + If the input is a constant array, the correlation is not defined in this case, + and ``np.nan`` is returned. + + >>> s1 = pd.Series([0.45, 0.45]) + >>> s1.corr(s1) + nan + """ result = self._op_via_apply( "corr", other=other, method=method, min_periods=min_periods ) return result - @doc(Series.cov.__doc__) def cov( self, other: Series, min_periods: int | None = None, ddof: int | None = 1 ) -> Series: + """ + Compute covariance with Series, excluding missing values. + + The two `Series` objects are not required to be the same length and + will be aligned internally before the covariance is calculated. + + Parameters + ---------- + other : Series + Series with which to compute the covariance. + min_periods : int, optional + Minimum number of observations needed to have a valid result. + ddof : int, default 1 + Delta degrees of freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + + Returns + ------- + float + Covariance between Series and other normalized by N-1 + (unbiased estimator). + + See Also + -------- + DataFrame.cov : Compute pairwise covariance of columns. + + Examples + -------- + >>> s1 = pd.Series([0.90010907, 0.13484424, 0.62036035]) + >>> s2 = pd.Series([0.12528585, 0.26962463, 0.51111198]) + >>> s1.cov(s2) + -0.01685762652715874 + """ result = self._op_via_apply( "cov", other=other, min_periods=min_periods, ddof=ddof ) @@ -1609,7 +2455,6 @@ def is_monotonic_decreasing(self) -> Series: """ return self.apply(lambda ser: ser.is_monotonic_decreasing) - @doc(Series.hist.__doc__) def hist( self, by=None, @@ -1625,6 +2470,72 @@ def hist( legend: bool = False, **kwargs, ): + """ + Draw histogram of the input series using matplotlib. + + Parameters + ---------- + by : object, optional + If passed, then used to form histograms for separate groups. 
+ ax : matplotlib axis object + If not passed, uses gca(). + grid : bool, default True + Whether to show axis grid lines. + xlabelsize : int, default None + If specified changes the x-axis label size. + xrot : float, default None + Rotation of x axis labels. + ylabelsize : int, default None + If specified changes the y-axis label size. + yrot : float, default None + Rotation of y axis labels. + figsize : tuple, default None + Figure size in inches by default. + bins : int or sequence, default 10 + Number of histogram bins to be used. If an integer is given, bins + 1 + bin edges are calculated and returned. If bins is a sequence, gives + bin edges, including left edge of first bin and right edge of last + bin. In this case, bins is returned unmodified. + backend : str, default None + Backend to use instead of the backend specified in the option + ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to + specify the ``plotting.backend`` for the whole session, set + ``pd.options.plotting.backend``. + legend : bool, default False + Whether to show the legend. + + **kwargs + To be passed to the actual plotting function. + + Returns + ------- + matplotlib.axes.Axes + A histogram plot. + + See Also + -------- + matplotlib.axes.Axes.hist : Plot a histogram using matplotlib. + + Examples + -------- + For Series: + + .. plot:: + :context: close-figs + + >>> lst = ["a", "a", "a", "b", "b", "b"] + >>> ser = pd.Series([1, 2, 2, 4, 6, 6], index=lst) + >>> hist = ser.hist() + + For Groupby: + + .. plot:: + :context: close-figs + + >>> lst = ["a", "a", "a", "b", "b", "b"] + >>> ser = pd.Series([1, 2, 2, 4, 6, 6], index=lst) + >>> hist = ser.groupby(level=0).hist() + """ result = self._op_via_apply( "hist", by=by, @@ -1643,8 +2554,23 @@ def hist( return result @property - @doc(Series.dtype.__doc__) def dtype(self) -> Series: + """ + Return the dtype object of the underlying data. + + See Also + -------- + Series.dtypes : Return the dtype object of the underlying data. 
+ Series.astype : Cast a pandas object to a specified dtype. + Series.convert_dtypes : Convert columns to the best possible dtypes using dtypes + supporting pd.NA. + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s.dtype + dtype('int64') + """ return self.apply(lambda ser: ser.dtype) def unique(self) -> Series: @@ -1695,7 +2621,7 @@ def unique(self) -> Series: return result -@set_module("pandas.api.typing") +@set_module("pandas.api.typing") # type: ignore class DataFrameGroupBy(GroupBy[DataFrame]): _agg_examples_doc = dedent( """ @@ -2269,34 +3195,123 @@ def _transform_general(self, func, engine, engine_kwargs, *args, **kwargs): concatenated = concatenated.reindex(concat_index, axis=1) return self._set_result_index_ordered(concatenated) - __examples_dataframe_doc = dedent( - """ - >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', - ... 'foo', 'bar'], - ... 'B' : ['one', 'one', 'two', 'three', - ... 'two', 'two'], - ... 'C' : [1, 5, 5, 2, 5, 5], - ... 'D' : [2.0, 5., 8., 1., 2., 9.]}) - >>> grouped = df.groupby('A')[['C', 'D']] - >>> grouped.transform(lambda x: (x - x.mean()) / x.std()) - C D - 0 -1.154701 -0.577350 - 1 0.577350 0.000000 - 2 0.577350 1.154701 - 3 -1.154701 -1.000000 - 4 0.577350 -0.577350 - 5 0.577350 1.000000 - - Broadcast result of the transformation - - >>> grouped.transform(lambda x: x.max() - x.min()) - C D - 0 4.0 6.0 - 1 3.0 8.0 - 2 4.0 6.0 - 3 3.0 8.0 - 4 4.0 6.0 - 5 3.0 8.0 + def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + """ + Call function producing a same-indexed DataFrame on each group. + + Returns a DataFrame having the same indexes as the original object + filled with the transformed values. + + Parameters + ---------- + func : function, str + Function to apply to each group. See the Notes section + below for requirements. + + Accepted inputs are: + + - String + - Python function + - Numba JIT function with ``engine='numba'`` specified.
+ + Only passing a single function is supported with this engine. + If the ``'numba'`` engine is chosen, the function must be + a user defined function with ``values`` and ``index`` as the + first and second arguments respectively in the function signature. + Each group's index will be passed to the user defined function + and optionally available for use. + + If a string is chosen, then it needs to be the name + of the groupby method you want to use. + *args + Positional arguments to pass to func. + engine : str, default None + * ``'cython'`` : Runs the function through C-extensions from cython. + * ``'numba'`` : Runs the function through JIT compiled code from numba. + * ``None`` : Defaults to ``cython`` or global setting ``compute.use_numba``. + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be + applied to the function. + + **kwargs + Keyword arguments to be passed into func. + + Returns + ------- + DataFrame + DataFrame with the same indexes as the original object filled + with transformed values. + + See Also + -------- + DataFrame.groupby.apply : Apply function ``func`` group-wise and combine + the results together. + DataFrame.groupby.aggregate : Aggregate using one or more operations. + DataFrame.transform : Call ``func`` on self producing a DataFrame with the + same axis shape as self. + + Notes + ----- + Each group is endowed with the attribute 'name' in case you need to know + which group you are working on. + + The current implementation imposes three requirements on f: + + * f must return a value that either has the same shape as the input + subframe or can be broadcast to the shape of the input subframe.
+ For example, if `f` returns a scalar it will be broadcast to have the + same shape as the input subframe. + * if this is a DataFrame, f must support application column-by-column + in the subframe. If f also supports application to the entire subframe, + then a fast path is used starting from the second chunk. + * f must not mutate groups. Mutation is not supported and may + produce unexpected results. See :ref:`gotchas.udf-mutation` for more details. + + When using ``engine='numba'``, there will be no "fall back" behavior internally. + The group data and group index will be passed as numpy arrays to the JITed + user defined function, and no alternative execution attempts will be tried. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. + + .. versionchanged:: 2.0.0 + + When using ``.transform`` on a grouped DataFrame and the transformation + function returns a DataFrame, pandas now aligns the result's index + with the input's index. You can call ``.to_numpy()`` on the + result of the transformation function to avoid alignment. + + Examples + -------- + + >>> ser = pd.Series( + ... [390.0, 350.0, 30.0, 20.0], + ... index=["Falcon", "Falcon", "Parrot", "Parrot"], + ... name="Max Speed", + ... 
) + >>> grouped = ser.groupby([1, 1, 2, 2]) + >>> grouped.transform(lambda x: (x - x.mean()) / x.std()) + Falcon 0.707107 + Falcon -0.707107 + Parrot 0.707107 + Parrot -0.707107 + Name: Max Speed, dtype: float64 + + Broadcast result of the transformation + + >>> grouped.transform(lambda x: x.max() - x.min()) + Falcon 40.0 + Falcon 40.0 + Parrot 10.0 + Parrot 10.0 + Name: Max Speed, dtype: float64 >>> grouped.transform("mean") C D @@ -2307,23 +3322,16 @@ def _transform_general(self, func, engine, engine_kwargs, *args, **kwargs): 4 3.666667 4.0 5 4.000000 5.0 - The resulting dtype will reflect the return value of the passed ``func``, - for example: - - >>> grouped.transform(lambda x: x.astype(int).max()) - C D - 0 5 8 - 1 5 9 - 2 5 8 - 3 5 9 - 4 5 8 - 5 5 9 - """ - ) - - @Substitution(klass="DataFrame", example=__examples_dataframe_doc) - @Appender(_transform_template) - def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + The resulting dtype will reflect the return value of the passed ``func``, + for example: + + >>> grouped.transform(lambda x: x.astype(int).max()) + Falcon 390 + Falcon 390 + Parrot 30 + Parrot 30 + Name: Max Speed, dtype: int64 + """ return self._transform( func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs ) @@ -3160,30 +4168,352 @@ def kurt( ) @property - @doc(DataFrame.plot.__doc__) def plot(self) -> GroupByPlot: + """ + Make plots of Series or DataFrame. + + Uses the backend specified by the + option ``plotting.backend``. By default, matplotlib is used. + + Parameters + ---------- + data : Series or DataFrame + The object for which the method is called. + + Attributes + ---------- + x : label or position, default None + Only used if data is a DataFrame. + y : label, position or list of label, positions, default None + Allows plotting of one column versus another. Only used if data is a + DataFrame. 
+ kind : str + The kind of plot to produce: + + - 'line' : line plot (default) + - 'bar' : vertical bar plot + - 'barh' : horizontal bar plot + - 'hist' : histogram + - 'box' : boxplot + - 'kde' : Kernel Density Estimation plot + - 'density' : same as 'kde' + - 'area' : area plot + - 'pie' : pie plot + - 'scatter' : scatter plot (DataFrame only) + - 'hexbin' : hexbin plot (DataFrame only) + ax : matplotlib axes object, default None + An axes of the current figure. + subplots : bool or sequence of iterables, default False + Whether to group columns into subplots: + + - ``False`` : No subplots will be used + - ``True`` : Make separate subplots for each column. + - sequence of iterables of column labels: Create a subplot for each + group of columns. For example `[('a', 'c'), ('b', 'd')]` will + create 2 subplots: one with columns 'a' and 'c', and one + with columns 'b' and 'd'. Remaining columns that aren't specified + will be plotted in additional subplots (one per column). + + .. versionadded:: 1.5.0 + + sharex : bool, default True if ax is None else False + In case ``subplots=True``, share x axis and set some x axis labels + to invisible; defaults to True if ax is None otherwise False if + an ax is passed in; Be aware, that passing in both an ax and + ``sharex=True`` will alter all x axis labels for all axis in a figure. + sharey : bool, default False + In case ``subplots=True``, share y axis and set some y axis + labels to invisible. + layout : tuple, optional + (rows, columns) for the layout of subplots. + figsize : a tuple (width, height) in inches + Size of a figure object. + use_index : bool, default True + Use index as ticks for x axis. + title : str or list + Title to use for the plot. If a string is passed, print the string + at the top of the figure. If a list is passed and `subplots` is + True, print each item in the list above the corresponding subplot. + grid : bool, default None (matlab style default) + Axis grid lines. 
+ legend : bool or {'reverse'} + Place legend on axis subplots. + style : list or dict + The matplotlib line style per column. + logx : bool or 'sym', default False + Use log scaling or symlog scaling on x axis. + + logy : bool or 'sym', default False + Use log scaling or symlog scaling on y axis. + + loglog : bool or 'sym', default False + Use log scaling or symlog scaling on both x and y axes. + + xticks : sequence + Values to use for the xticks. + yticks : sequence + Values to use for the yticks. + xlim : 2-tuple/list + Set the x limits of the current axes. + ylim : 2-tuple/list + Set the y limits of the current axes. + xlabel : label, optional + Name to use for the xlabel on x-axis. Default uses index name as xlabel, + or the x-column name for planar plots. + + .. versionchanged:: 2.0.0 + + Now applicable to histograms. + + ylabel : label, optional + Name to use for the ylabel on y-axis. Default will show no ylabel, or the + y-column name for planar plots. + + .. versionchanged:: 2.0.0 + + Now applicable to histograms. + + rot : float, default None + Rotation for ticks (xticks for vertical, yticks for horizontal + plots). + fontsize : float, default None + Font size for xticks and yticks. + colormap : str or matplotlib colormap object, default None + Colormap to select colors from. If string, load colormap with that + name from matplotlib. + colorbar : bool, optional + If True, plot colorbar (only relevant for 'scatter' and 'hexbin' + plots). + position : float + Specify relative alignments for bar plot layout. + From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 + (center). + table : bool, Series or DataFrame, default False + If True, draw a table using the data in the DataFrame and the data + will be transposed to meet matplotlib's default layout. + If a Series or DataFrame is passed, use passed data to draw a + table. + yerr : DataFrame, Series, array-like, dict and str + See :ref:`Plotting with Error Bars <visualization.errorbars>` for + detail.
+ xerr : DataFrame, Series, array-like, dict and str + Equivalent to yerr. + stacked : bool, default False in line and bar plots, and True in area plot + If True, create stacked plot. + secondary_y : bool or sequence, default False + Whether to plot on the secondary y-axis if a list/tuple, which + columns to plot on secondary y-axis. + mark_right : bool, default True + When using a secondary_y axis, automatically mark the column + labels with "(right)" in the legend. + include_bool : bool, default is False + If True, boolean values can be plotted. + backend : str, default None + Backend to use instead of the backend specified in the option + ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to + specify the ``plotting.backend`` for the whole session, set + ``pd.options.plotting.backend``. + **kwargs + Options to pass to matplotlib plotting method. + + Returns + ------- + :class:`matplotlib.axes.Axes` or numpy.ndarray of them + If the backend is not the default matplotlib one, the return value + will be the object returned by the backend. + + See Also + -------- + matplotlib.pyplot.plot : Plot y versus x as lines and/or markers. + DataFrame.hist : Make a histogram. + DataFrame.boxplot : Make a box plot. + DataFrame.plot.scatter : Make a scatter plot with varying marker + point size and color. + DataFrame.plot.hexbin : Make a hexagonal binning plot of + two variables. + DataFrame.plot.kde : Make Kernel Density Estimate plot using + Gaussian kernels. + DataFrame.plot.area : Make a stacked area plot. + DataFrame.plot.bar : Make a bar plot. + DataFrame.plot.barh : Make a horizontal bar plot. + + Notes + ----- + - See matplotlib documentation online for more on this subject + - If `kind` = 'bar' or 'barh', you can specify relative alignments + for bar plot layout by `position` keyword. + From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 + (center) + + Examples + -------- + For Series: + + .. 
plot:: + :context: close-figs + + >>> ser = pd.Series([1, 2, 3, 3]) + >>> plot = ser.plot(kind="hist", title="My plot") + + For DataFrame: + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame( + ... { + ... "length": [1.5, 0.5, 1.2, 0.9, 3], + ... "width": [0.7, 0.2, 0.15, 0.2, 1.1], + ... }, + ... index=["pig", "rabbit", "duck", "chicken", "horse"], + ... ) + >>> plot = df.plot(title="DataFrame Plot") + + For SeriesGroupBy: + + .. plot:: + :context: close-figs + + >>> lst = [-1, -2, -3, 1, 2, 3] + >>> ser = pd.Series([1, 2, 2, 4, 6, 6], index=lst) + >>> plot = ser.groupby(lambda x: x > 0).plot(title="SeriesGroupBy Plot") + + For DataFrameGroupBy: + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame({"col1": [1, 2, 3, 4], "col2": ["A", "B", "A", "B"]}) + >>> plot = df.groupby("col2").plot( + ... kind="bar", title="DataFrameGroupBy Plot" + ... ) + """ result = GroupByPlot(self) return result - @doc(DataFrame.corr.__doc__) def corr( self, method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson", min_periods: int = 1, numeric_only: bool = False, ) -> DataFrame: + """ + Compute correlation with `other` Series, excluding missing values. + + The two `Series` objects are not required to be the same length and will be + aligned internally before the correlation function is applied. + + Parameters + ---------- + other : Series + Series with which to compute the correlation. + method : {'pearson', 'kendall', 'spearman'} or callable + Method used to compute correlation: + + - pearson : Standard correlation coefficient + - kendall : Kendall Tau correlation coefficient + - spearman : Spearman rank correlation + - callable: Callable with input two 1d ndarrays and returning a float. + + .. warning:: + Note that the returned matrix from corr will have 1 along the + diagonals and will be symmetric regardless of the callable's + behavior. + min_periods : int, optional + Minimum number of observations needed to have a valid result. 
+ + Returns + ------- + float + Correlation with other. + + See Also + -------- + DataFrame.corr : Compute pairwise correlation between columns. + DataFrame.corrwith : Compute pairwise correlation with another + DataFrame or Series. + + Notes + ----- + Pearson, Kendall and Spearman correlation are currently computed using + pairwise complete observations. + + * `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_ + * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_ + * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_ + + Automatic data alignment: as with all pandas operations, + automatic data alignment is performed for this method. + ``corr()`` automatically considers values with matching indices. + + Examples + -------- + >>> def histogram_intersection(a, b): + ... v = np.minimum(a, b).sum().round(decimals=1) + ... return v + >>> s1 = pd.Series([0.2, 0.0, 0.6, 0.2]) + >>> s2 = pd.Series([0.3, 0.6, 0.0, 0.1]) + >>> s1.corr(s2, method=histogram_intersection) + 0.3 + + Pandas auto-aligns the values with matching indices + + >>> s1 = pd.Series([1, 2, 3], index=[0, 1, 2]) + >>> s2 = pd.Series([1, 2, 3], index=[2, 1, 0]) + >>> s1.corr(s2) + -1.0 + + If the input is a constant array, the correlation is not defined in this case, + and ``np.nan`` is returned. + + >>> s1 = pd.Series([0.45, 0.45]) + >>> s1.corr(s1) + nan + """ result = self._op_via_apply( "corr", method=method, min_periods=min_periods, numeric_only=numeric_only ) return result - @doc(DataFrame.cov.__doc__) def cov( self, min_periods: int | None = None, ddof: int | None = 1, numeric_only: bool = False, ) -> DataFrame: + """ + Compute covariance with Series, excluding missing values. + + The two `Series` objects are not required to be the same length and + will be aligned internally before the covariance is calculated. + + Parameters + ---------- + other : Series + Series with which to compute the covariance. + min_periods : int, optional + Minimum number of observations needed to have a valid result.
+ ddof : int, default 1 + Delta degrees of freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + + Returns + ------- + float + Covariance between Series and other normalized by N-1 + (unbiased estimator). + + See Also + -------- + DataFrame.cov : Compute pairwise covariance of columns. + + Examples + -------- + >>> s1 = pd.Series([0.90010907, 0.13484424, 0.62036035]) + >>> s2 = pd.Series([0.12528585, 0.26962463, 0.51111198]) + >>> s1.cov(s2) + -0.01685762652715874 + """ result = self._op_via_apply( "cov", min_periods=min_periods, ddof=ddof, numeric_only=numeric_only )