Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1022,11 +1022,11 @@ Performance improvements
- Performance improvement in :meth:`DataFrame.astype` when converting to extension floating dtypes, e.g. "Float64" (:issue:`60066`)
- Performance improvement in :meth:`DataFrame.stack` when using ``future_stack=True`` and the DataFrame does not have a :class:`MultiIndex` (:issue:`58391`)
- Performance improvement in :meth:`DataFrame.where` when ``cond`` is a :class:`DataFrame` with many columns (:issue:`61010`)
- Performance improvement in :meth:`merge` and ``DataFrame.merge``. Users can now use prefixes, or both suffixes and prefixes, to differentiate duplicated columns. (:issue:`63014`)
- Performance improvement in :meth:`to_hdf` by avoiding unnecessary reopenings of the HDF5 file, which speeds up data addition to files with a very large number of groups. (:issue:`58248`)
- Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`)
- Performance improvement in indexing operations for string dtypes (:issue:`56997`)
- Performance improvement in unary methods on a :class:`RangeIndex` returning a :class:`RangeIndex` instead of an :class:`Index` when possible. (:issue:`57825`)

.. ---------------------------------------------------------------------------
.. _whatsnew_300.bug_fixes:

Expand Down
20 changes: 20 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,13 +368,29 @@
sort : bool, default False
Sort the join keys lexicographically in the result DataFrame. If False,
the order of the join keys depends on the join type (how keyword).
diff_option : {"suffix", "prefix", "both"}, default "suffix"
The allowed values are "suffix", "prefix" and "both".
If the value is "suffix", the duplicated columns will be differentiated
using the suffixes provided by parameter "suffixes".
If the value is "prefix", the duplicated columns will be differentiated
using the prefixes provided by parameter "prefixes".
If the value is "both", the duplicated columns will be differentiated
using both the suffixes provided by parameter "suffixes" and
the prefixes provided by parameter "prefixes".
suffixes : list-like, default is ("_x", "_y")
A length-2 sequence where each element is optionally a string
indicating the suffix to add to overlapping column names in
`left` and `right` respectively. Pass a value of `None` instead
of a string to indicate that the column name from `left` or
`right` should be left as-is, with no suffix. At least one of the
values must not be None.
prefixes : list-like, default is ("a_", "b_")
A length-2 sequence where each element is optionally a string
indicating the prefix to add to overlapping column names in
`left` and `right` respectively. Pass a value of `None` instead
of a string to indicate that the column name from `left` or
`right` should be left as-is, with no prefix. At least one of the
values must not be None.
copy : bool, default False
If False, avoid copy if possible.

Expand Down Expand Up @@ -11437,7 +11453,9 @@ def merge(
left_index: bool = False,
right_index: bool = False,
sort: bool = False,
diff_option: Literal["prefix", "suffix", "both"] = "suffix",
suffixes: Suffixes = ("_x", "_y"),
prefixes: Sequence[str | None] = ("a_", "b_"),
copy: bool | lib.NoDefault = lib.no_default,
indicator: str | bool = False,
validate: MergeValidate | None = None,
Expand All @@ -11456,7 +11474,9 @@ def merge(
left_index=left_index,
right_index=right_index,
sort=sort,
diff_option=diff_option,
suffixes=suffixes,
prefixes=prefixes,
indicator=indicator,
validate=validate,
)
Expand Down
118 changes: 99 additions & 19 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,11 @@ def merge(
left_index: bool = False,
right_index: bool = False,
sort: bool = False,
diff_option: Literal[
"prefix", "suffix", "both"
] = "suffix", # add new parameter prefixes diff_option
suffixes: Suffixes = ("_x", "_y"),
prefixes: Sequence[str | None] = ("a_", "b_"), # add new parameter prefixes
copy: bool | lib.NoDefault = lib.no_default,
indicator: str | bool = False,
validate: str | None = None,
Expand Down Expand Up @@ -221,13 +225,29 @@ def merge(
sort : bool, default False
Sort the join keys lexicographically in the result DataFrame. If False,
the order of the join keys depends on the join type (how keyword).
diff_option : {"suffix", "prefix", "both"}, default "suffix"
The allowed values are "suffix", "prefix" and "both".
If the value is "suffix", the duplicated columns will be differentiated
using the suffixes provided by parameter "suffixes".
If the value is "prefix", the duplicated columns will be differentiated
using the prefixes provided by parameter "prefixes".
If the value is "both", the duplicated columns will be differentiated
using both the suffixes provided by parameter "suffixes" and
the prefixes provided by parameter "prefixes".
suffixes : list-like, default is ("_x", "_y")
A length-2 sequence where each element is optionally a string
indicating the suffix to add to overlapping column names in
`left` and `right` respectively. Pass a value of `None` instead
of a string to indicate that the column name from `left` or
`right` should be left as-is, with no suffix. At least one of the
values must not be None.
prefixes : list-like, default is ("a_", "b_")
A length-2 sequence where each element is optionally a string
indicating the prefix to add to overlapping column names in
`left` and `right` respectively. Pass a value of `None` instead
of a string to indicate that the column name from `left` or
`right` should be left as-is, with no prefix. At least one of the
values must not be None.
copy : bool, default False
If False, avoid copy if possible.

Expand Down Expand Up @@ -370,6 +390,13 @@ def merge(
left_df = _validate_operand(left)
left._check_copy_deprecation(copy)
right_df = _validate_operand(right)

if diff_option != "prefix" and diff_option != "suffix" and diff_option != "both":
raise ValueError(
"Parameter 'diff_option' is wrong, please choose from 'prefix'"
", 'suffix' and 'both'."
)

if how == "cross":
return _cross_merge(
left_df,
Expand All @@ -380,7 +407,9 @@ def merge(
left_index=left_index,
right_index=right_index,
sort=sort,
diff_option=diff_option,
suffixes=suffixes,
prefixes=prefixes,
indicator=indicator,
validate=validate,
)
Expand All @@ -395,7 +424,9 @@ def merge(
left_index=left_index,
right_index=right_index,
sort=sort,
diff_option=diff_option,
suffixes=suffixes,
prefixes=prefixes,
indicator=indicator,
validate=validate,
)
Expand All @@ -411,7 +442,11 @@ def _cross_merge(
left_index: bool = False,
right_index: bool = False,
sort: bool = False,
diff_option: Literal[
"prefix", "suffix", "both"
] = "suffix", # add new parameter prefixes diff_option
suffixes: Suffixes = ("_x", "_y"),
prefixes: Sequence[str | None] = ("a_", "b_"), # add new parameter prefixes
indicator: str | bool = False,
validate: str | None = None,
) -> DataFrame:
Expand Down Expand Up @@ -447,7 +482,9 @@ def _cross_merge(
left_index=left_index,
right_index=right_index,
sort=sort,
diff_option=diff_option,
suffixes=suffixes,
prefixes=prefixes,
indicator=indicator,
validate=validate,
)
Expand Down Expand Up @@ -954,7 +991,9 @@ class _MergeOperation:
left_index: bool
right_index: bool
sort: bool
diff_option: Literal["prefix", "suffix", "both"]
suffixes: Suffixes
prefixes: Sequence[str | None]
indicator: str | bool
validate: str | None
join_names: list[Hashable]
Expand All @@ -972,7 +1011,11 @@ def __init__(
left_index: bool = False,
right_index: bool = False,
sort: bool = True,
diff_option: Literal[
"prefix", "suffix", "both"
] = "suffix", # add new parameter prefixes diff_option
suffixes: Suffixes = ("_x", "_y"),
prefixes: Sequence[str | None] = ("a_", "b_"), # add new parameter prefixes
indicator: str | bool = False,
validate: str | None = None,
) -> None:
Expand All @@ -985,6 +1028,8 @@ def __init__(
self.on = com.maybe_make_list(on)

self.suffixes = suffixes
self.prefixes = prefixes
self.diff_option = diff_option
self.sort = sort or how == "outer"

self.left_index = left_index
Expand Down Expand Up @@ -1094,8 +1139,12 @@ def _reindex_and_concat(
left = self.left[:]
right = self.right[:]

llabels, rlabels = _items_overlap_with_suffix(
self.left._info_axis, self.right._info_axis, self.suffixes
llabels, rlabels = _items_overlap_with_suffix_or_prefix(
self.left._info_axis,
self.right._info_axis,
self.suffixes,
self.prefixes,
self.diff_option,
)

if left_indexer is not None and not is_range_indexer(left_indexer, len(left)):
Expand Down Expand Up @@ -3059,54 +3108,84 @@ def _validate_operand(obj: DataFrame | Series) -> DataFrame:
)


def _items_overlap_with_suffix(
left: Index, right: Index, suffixes: Suffixes
def _items_overlap_with_suffix_or_prefix(
left: Index,
right: Index,
suffixes: Suffixes,
prefixes: Sequence[str | None],
diff_option: Literal["prefix", "suffix", "both"],
) -> tuple[Index, Index]:
"""
Suffixes type validation.
Suffixes and Prefixes type validation.

If two indices overlap, add suffixes to overlapping entries.
If two indices overlap, add suffixes and prefixes to overlapping entries.

If corresponding suffix is empty, the entry is simply converted to string.
If corresponding suffix and prefix are empty,
the entry is simply converted to string.

"""
if not is_list_like(suffixes, allow_sets=False) or isinstance(suffixes, dict):
if (diff_option == "both" or diff_option == "suffix") and (
not is_list_like(suffixes, allow_sets=False) or isinstance(suffixes, dict)
):
raise TypeError(
f"Passing 'suffixes' as a {type(suffixes)}, is not supported. "
"Provide 'suffixes' as a tuple instead."
)
if (diff_option == "both" or diff_option == "prefix") and (
not is_list_like(prefixes, allow_sets=False) or isinstance(prefixes, dict)
):
raise TypeError(
f"Passing 'prefixes' as a {type(prefixes)}, is not supported. "
"Provide 'prefixes' as a tuple instead."
)

to_rename = left.intersection(right)
if len(to_rename) == 0:
return left, right

lsuffix, rsuffix = suffixes
if diff_option == "both" or diff_option == "suffix":
lsuffix, rsuffix = suffixes
else:
lsuffix, rsuffix = None, None

if not lsuffix and not rsuffix:
raise ValueError(f"columns overlap but no suffix specified: {to_rename}")
if diff_option == "both" or diff_option == "prefix":
lprefix, rprefix = prefixes
else:
lprefix, rprefix = None, None

if not lsuffix and not rsuffix and not lprefix and not rprefix:
raise ValueError(
f"columns overlap but no suffix or prefix specified: {to_rename}"
)

def renamer(x, suffix: str | None):
def renamer(x, suffix: str | None, prefix: str | None):
"""
Rename the left and right indices.

If there is overlap, and suffix is not None, add
suffix, otherwise, leave it as-is.
If there is overlap, and suffix or prefix is not None, add
suffix or prefix(or both if both are provided), otherwise, leave it as-is.

Parameters
----------
x : original column name
suffix : str or None
prefix : str or None

Returns
-------
x : renamed column name
"""
if x in to_rename and suffix is not None:
return f"{x}{suffix}"
ret = x
if x in to_rename:
if suffix is not None:
ret = f"{ret}{suffix}"
if prefix is not None:
ret = f"{prefix}{ret}"
return ret
return x

lrenamer = partial(renamer, suffix=lsuffix)
rrenamer = partial(renamer, suffix=rsuffix)
lrenamer = partial(renamer, suffix=lsuffix, prefix=lprefix)
rrenamer = partial(renamer, suffix=rsuffix, prefix=rprefix)

llabels = left._transform_index(lrenamer)
rlabels = right._transform_index(rrenamer)
Expand All @@ -3123,7 +3202,8 @@ def renamer(x, suffix: str | None):
dups.extend(rlabels.intersection(left.difference(to_rename)).tolist())
if dups:
raise MergeError(
f"Passing 'suffixes' which cause duplicate columns {set(dups)} is "
f"Passing 'suffixes' or/and 'prefixes' "
f"which cause duplicate columns {set(dups)} is "
"not allowed.",
)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/reshape/merge/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -783,7 +783,7 @@ def test_join_dups(self):
# GH 40991: As of 2.0 causes duplicate columns
with pytest.raises(
pd.errors.MergeError,
match="Passing 'suffixes' which cause duplicate columns",
match="Passing 'suffixes' or/and 'prefixes' which cause duplicate columns",
):
dta.merge(w, left_index=True, right_index=True)

Expand Down
Loading
Loading