From 7408b85a2fdafaf2b855c25e11ecae0a7c86820f Mon Sep 17 00:00:00 2001 From: anubhav-0012 Date: Sat, 25 Oct 2025 01:05:39 +0530 Subject: [PATCH 1/4] improve merge validation error messages --- pandas/core/reshape/merge.py | 31 +++++++++++++++++++++--- pandas/tests/reshape/merge/test_merge.py | 5 ++++ 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 604181214ad44..c5e147048218a 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1954,42 +1954,65 @@ def _validate_left_right_on(self, left_on, right_on): def _validate_validate_kwd(self, validate: str) -> None: # Check uniqueness of each if self.left_index: - left_unique = self.orig_left.index.is_unique + left_keys = self.orig_left.index else: - left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique + left_keys = MultiIndex.from_arrays(self.left_join_keys) if self.right_index: - right_unique = self.orig_right.index.is_unique + right_keys = self.orig_right.index else: - right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique + right_keys = MultiIndex.from_arrays(self.right_join_keys) + + left_unique = left_keys.is_unique + right_unique = right_keys.is_unique + + def sample_duplicates(keys, limit=10): + """Return up to 'limit' unique duplicate keys.""" + keys = Index(keys) + dups = keys[keys.duplicated()] + if not len(dups): + return [] + return list(dups.unique()[:limit]) # Check data integrity if validate in ["one_to_one", "1:1"]: if not left_unique and not right_unique: + combined_keys = list(left_keys.append(right_keys)) + sample = sample_duplicates(combined_keys, limit=10) raise MergeError( "Merge keys are not unique in either left " "or right dataset; not a one-to-one merge" + f"Offending keys (sample): {sample}" ) if not left_unique: + sample = sample_duplicates(left_keys) raise MergeError( "Merge keys are not unique in left dataset; not a one-to-one merge" + f"Offending keys (sample): {sample}" ) if not right_unique: + sample = sample_duplicates(right_keys) raise MergeError( "Merge keys are not unique in right dataset; not a one-to-one merge" + f"Offending keys (sample): {sample}" ) elif validate in ["one_to_many", "1:m"]: if not left_unique: + sample = sample_duplicates(left_keys) raise MergeError( "Merge keys are not unique in left dataset; not a one-to-many merge" + f"Offending keys (sample): {sample}" + ) elif validate in ["many_to_one", "m:1"]: if not right_unique: + sample = sample_duplicates(right_keys) raise MergeError( "Merge keys are not unique in right dataset; " "not a many-to-one merge" + f"Offending keys (sample): {sample}" ) elif validate in ["many_to_many", "m:m"]: diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index c38ee32cb7226..e61d3d3bcffdb 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -3121,3 +3121,8 @@ def test_merge_pyarrow_datetime_duplicates(): ) expected = expected.convert_dtypes(dtype_backend="pyarrow") tm.assert_frame_equal(result, expected) + +def test_merge_validate_one_to_one_offending_keys(): + df = pd.DataFrame({"a": [1, 1, 3], "b": [4, 5, 6]}) + with pytest.raises(pd.errors.MergeError, match="Offending keys"): + df.merge(df, on="a", validate="one_to_one") From 80c9afad1d9afaafd12b09f1c88140a84effe032 Mon Sep 17 00:00:00 2001 From: anubhav-0012 Date: Sat, 25 Oct 2025 11:30:37 +0530 Subject: [PATCH 2/4] remove unnecessary index conversion --- pandas/core/reshape/merge.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index c5e147048218a..463d9bca3a8d5 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1968,7 +1968,6 @@ def _validate_validate_kwd(self, validate: str) -> None: def sample_duplicates(keys, limit=10): """Return up to 'limit' unique duplicate keys.""" - keys = Index(keys) dups = keys[keys.duplicated()] if not len(dups): return [] @@ -1977,23 +1976,25 @@ def sample_duplicates(keys, limit=10): # Check data integrity if validate in ["one_to_one", "1:1"]: if not left_unique and not right_unique: - combined_keys = list(left_keys.append(right_keys)) - sample = sample_duplicates(combined_keys, limit=10) + left_sample = sample_duplicates(left_keys, limit=5) + right_sample = sample_duplicates(right_keys, limit=5) + raise MergeError( "Merge keys are not unique in either left " - "or right dataset; not a one-to-one merge" - f"Offending keys (sample): {sample}" + "or right dataset; not a one-to-one merge. " + f"Offending keys in left dataset (sample): {left_sample} " + f"Offending keys in right dataset (sample): {right_sample} " ) if not left_unique: sample = sample_duplicates(left_keys) raise MergeError( - "Merge keys are not unique in left dataset; not a one-to-one merge" + "Merge keys are not unique in left dataset; not a one-to-one merge. " f"Offending keys (sample): {sample}" ) if not right_unique: sample = sample_duplicates(right_keys) raise MergeError( - "Merge keys are not unique in right dataset; not a one-to-one merge" + "Merge keys are not unique in right dataset; not a one-to-one merge. " f"Offending keys (sample): {sample}" ) @@ -2001,7 +2002,7 @@ def sample_duplicates(keys, limit=10): if not left_unique: sample = sample_duplicates(left_keys) raise MergeError( - "Merge keys are not unique in left dataset; not a one-to-many merge" + "Merge keys are not unique in left dataset; not a one-to-many merge. " f"Offending keys (sample): {sample}" ) @@ -2011,7 +2012,7 @@ def sample_duplicates(keys, limit=10): sample = sample_duplicates(right_keys) raise MergeError( "Merge keys are not unique in right dataset; " - "not a many-to-one merge" + "not a many-to-one merge. " f"Offending keys (sample): {sample}" ) From 9888c41b5be304da4115514458c7b48a2ab30346 Mon Sep 17 00:00:00 2001 From: anubhav-0012 Date: Sat, 25 Oct 2025 18:05:27 +0530 Subject: [PATCH 3/4] Fix pre-commit formatting --- pandas/core/reshape/merge.py | 14 ++++++++------ pandas/tests/reshape/merge/test_merge.py | 1 + 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 463d9bca3a8d5..a66f52d45b761 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1962,7 +1962,7 @@ def _validate_validate_kwd(self, validate: str) -> None: right_keys = self.orig_right.index else: right_keys = MultiIndex.from_arrays(self.right_join_keys) - + left_unique = left_keys.is_unique right_unique = right_keys.is_unique @@ -1978,7 +1978,7 @@ def sample_duplicates(keys, limit=10): if not left_unique and not right_unique: left_sample = sample_duplicates(left_keys, limit=5) right_sample = sample_duplicates(right_keys, limit=5) - + raise MergeError( "Merge keys are not unique in either left " "or right dataset; not a one-to-one merge. " @@ -1988,13 +1988,15 @@ def sample_duplicates(keys, limit=10): if not left_unique: sample = sample_duplicates(left_keys) raise MergeError( - "Merge keys are not unique in left dataset; not a one-to-one merge. " + "Merge keys are not unique in left dataset; " + "not a one-to-one merge. " f"Offending keys (sample): {sample}" ) if not right_unique: sample = sample_duplicates(right_keys) raise MergeError( - "Merge keys are not unique in right dataset; not a one-to-one merge. " + "Merge keys are not unique in right dataset; " + "not a one-to-one merge. " f"Offending keys (sample): {sample}" ) @@ -2002,9 +2004,9 @@ def sample_duplicates(keys, limit=10): if not left_unique: sample = sample_duplicates(left_keys) raise MergeError( - "Merge keys are not unique in left dataset; not a one-to-many merge. " + "Merge keys are not unique in left dataset; " + "not a one-to-many merge. " f"Offending keys (sample): {sample}" - ) elif validate in ["many_to_one", "m:1"]: diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index e61d3d3bcffdb..7df6856bbf431 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -3122,6 +3122,7 @@ def test_merge_pyarrow_datetime_duplicates(): expected = expected.convert_dtypes(dtype_backend="pyarrow") tm.assert_frame_equal(result, expected) + def test_merge_validate_one_to_one_offending_keys(): df = pd.DataFrame({"a": [1, 1, 3], "b": [4, 5, 6]}) with pytest.raises(pd.errors.MergeError, match="Offending keys"): From 58d76894a027ff7d87426684716b93d794d13b83 Mon Sep 17 00:00:00 2001 From: anubhav-0012 Date: Sat, 25 Oct 2025 18:22:25 +0530 Subject: [PATCH 4/4] replace pd.DataFrame with DataFrame --- pandas/tests/reshape/merge/test_merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 7df6856bbf431..e940aba9f4018 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -3124,6 +3124,6 @@ def test_merge_pyarrow_datetime_duplicates(): def test_merge_validate_one_to_one_offending_keys(): - df = pd.DataFrame({"a": [1, 1, 3], "b": [4, 5, 6]}) + df = DataFrame({"a": [1, 1, 3], "b": [4, 5, 6]}) with pytest.raises(pd.errors.MergeError, match="Offending keys"): df.merge(df, on="a", validate="one_to_one")