Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 35 additions & 9 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1954,42 +1954,68 @@ def _validate_left_right_on(self, left_on, right_on):
def _validate_validate_kwd(self, validate: str) -> None:
# Check uniqueness of each
if self.left_index:
left_unique = self.orig_left.index.is_unique
left_keys = self.orig_left.index
else:
left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique
left_keys = MultiIndex.from_arrays(self.left_join_keys)

if self.right_index:
right_unique = self.orig_right.index.is_unique
right_keys = self.orig_right.index
else:
right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique
right_keys = MultiIndex.from_arrays(self.right_join_keys)

left_unique = left_keys.is_unique
right_unique = right_keys.is_unique

def sample_duplicates(keys, limit=10):
"""Return up to 'limit' unique duplicate keys."""
dups = keys[keys.duplicated()]
if not len(dups):
return []
return list(dups.unique()[:limit])

# Check data integrity
if validate in ["one_to_one", "1:1"]:
if not left_unique and not right_unique:
left_sample = sample_duplicates(left_keys, limit=5)
right_sample = sample_duplicates(right_keys, limit=5)

raise MergeError(
"Merge keys are not unique in either left "
"or right dataset; not a one-to-one merge"
"or right dataset; not a one-to-one merge. "
f"Offending keys in left dataset (sample): {left_sample} "
f"Offending keys in right dataset (sample): {right_sample} "
)
if not left_unique:
sample = sample_duplicates(left_keys)
raise MergeError(
"Merge keys are not unique in left dataset; not a one-to-one merge"
"Merge keys are not unique in left dataset; "
"not a one-to-one merge. "
f"Offending keys (sample): {sample}"
)
if not right_unique:
sample = sample_duplicates(right_keys)
raise MergeError(
"Merge keys are not unique in right dataset; not a one-to-one merge"
"Merge keys are not unique in right dataset; "
"not a one-to-one merge. "
f"Offending keys (sample): {sample}"
)

elif validate in ["one_to_many", "1:m"]:
if not left_unique:
sample = sample_duplicates(left_keys)
raise MergeError(
"Merge keys are not unique in left dataset; not a one-to-many merge"
"Merge keys are not unique in left dataset; "
"not a one-to-many merge. "
f"Offending keys (sample): {sample}"
)

elif validate in ["many_to_one", "m:1"]:
if not right_unique:
sample = sample_duplicates(right_keys)
raise MergeError(
"Merge keys are not unique in right dataset; "
"not a many-to-one merge"
"not a many-to-one merge. "
f"Offending keys (sample): {sample}"
)

elif validate in ["many_to_many", "m:m"]:
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -3121,3 +3121,9 @@ def test_merge_pyarrow_datetime_duplicates():
)
expected = expected.convert_dtypes(dtype_backend="pyarrow")
tm.assert_frame_equal(result, expected)


def test_merge_validate_one_to_one_offending_keys():
    """Check that a failed one-to-one validation reports offending keys.

    Column ``a`` holds the value 1 twice, so self-merging on it with
    ``validate="one_to_one"`` must raise ``MergeError``; the test only
    requires that the message mentions "Offending keys".
    """
    frame = DataFrame({"a": [1, 1, 3], "b": [4, 5, 6]})
    expected_error = pytest.raises(pd.errors.MergeError, match="Offending keys")
    with expected_error:
        frame.merge(frame, on="a", validate="one_to_one")
Loading