From 9c93e036d5a052c2eeb994fd9c427fba44699068 Mon Sep 17 00:00:00 2001 From: Marie Date: Sat, 28 Mar 2026 15:53:50 +0100 Subject: [PATCH 1/7] fix: improve the error message provided by deduplicate --- skrub/_deduplicate.py | 33 +++++++++++++++++++++++--------- skrub/tests/test_deduplicate.py | 34 +++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 9 deletions(-) diff --git a/skrub/_deduplicate.py b/skrub/_deduplicate.py index 7e62d8097..d0528568f 100644 --- a/skrub/_deduplicate.py +++ b/skrub/_deduplicate.py @@ -2,6 +2,8 @@ Implements deduplication based on clustering string distance matrices. """ +import warnings + import numpy as np import pandas as pd from joblib import Parallel, delayed @@ -75,7 +77,7 @@ def _guess_clusters(Z, distance_mat, n_jobs=None): number of clusters that maximize the silhouette score. """ max_clusters = Z.shape[0] - n_clusters = np.arange(2, max_clusters) + n_clusters = np.arange(1, max_clusters) # silhouette score needs a redundant distance matrix redundant_dist = squareform(distance_mat) silhouette_scores = Parallel(n_jobs=n_jobs, prefer="processes")( @@ -136,6 +138,7 @@ def deduplicate( analyzer="char_wb", linkage_method="average", n_jobs=None, + warn=False, ): """Deduplicate categorical data by hierarchically clustering similar strings. @@ -168,6 +171,9 @@ def deduplicate( average distance between data points in the first and second cluster. n_jobs : int, default=None The number of jobs to run in parallel. + warn : bool, default=False + If True, emit a warning when clustering fails (e.g. too few or too + similar entries) and the input is returned unchanged. 
Returns ------- @@ -260,14 +266,23 @@ def deduplicate( 9 white 9 white """ unique_words, counts = np.unique(X, return_counts=True) - distance_mat = _compute_ngram_distance( - unique_words, ngram_range=ngram_range, analyzer=analyzer - ) - - Z = linkage(distance_mat, method=linkage_method, optimal_ordering=True) - if n_clusters is None: - n_clusters = _guess_clusters(Z, distance_mat, n_jobs) - clusters = fcluster(Z, n_clusters, criterion="maxclust") + try: + distance_mat = _compute_ngram_distance( + unique_words, ngram_range=ngram_range, analyzer=analyzer + ) + Z = linkage(distance_mat, method=linkage_method, optimal_ordering=True) + if n_clusters is None: + n_clusters = _guess_clusters(Z, distance_mat, n_jobs) + clusters = fcluster(Z, n_clusters, criterion="maxclust") + except Exception: + if warn: + warnings.warn( + "Deduplication could not cluster the data (too few or too similar" + " entries). Returning the input unchanged.", + UserWarning, + stacklevel=2, + ) + return list(X) translation_table = _create_spelling_correction(unique_words, counts, clusters) unrolled_corrections = translation_table[X] diff --git a/skrub/tests/test_deduplicate.py b/skrub/tests/test_deduplicate.py index 72309c4d4..53f49b036 100644 --- a/skrub/tests/test_deduplicate.py +++ b/skrub/tests/test_deduplicate.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd import pytest +import warnings from scipy.cluster.hierarchy import linkage from scipy.spatial.distance import squareform from sklearn.utils._testing import assert_array_equal, skip_if_no_parallel @@ -145,6 +146,39 @@ def start_call(self): joblib.register_parallel_backend("testing", DummyBackend) +@pytest.mark.parametrize( + "X", + [ + # Too few unique entries for silhouette score (only 2 unique values) + ["black", "black", "black", "blac"], + # Too few unique entries (3 unique values, only 1 cluster possible) + ["black", "white", "black", "black", "blac"], + # 4 unique values but still not enough for clustering to succeed + 
["black", "black", "black", "black", "white", "white", "white", "red", "green"], + ], +) +def test_deduplicate_failure_returns_input(X): + result = deduplicate(X) + assert isinstance(result, list) + assert result == X + + +@pytest.mark.parametrize( + "X", + [ + ["black", "black", "black", "blac"], + ["black", "white", "black", "black", "blac"], + ], +) +def test_deduplicate_warn(X): + with pytest.warns(UserWarning, match="Returning the input unchanged"): + deduplicate(X, warn=True) + + with warnings.catch_warnings(): + warnings.simplefilter("error") + deduplicate(X, warn=False) # raises if any warning is emitted + + @skip_if_no_parallel def test_backend_respected(): """ From 01756e7b47bf01deffea1edada78c7d2cd820fe8 Mon Sep 17 00:00:00 2001 From: Marie Date: Sat, 28 Mar 2026 18:01:02 +0100 Subject: [PATCH 2/7] add changelog --- CHANGES.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 6a272b69c..7595553a9 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -31,9 +31,12 @@ Changes supported by :class:`~sklearn.neighbors.NearestNeighbors` (see its docstring). :pr:`1861` by :user:`Saba Siddique `. + Bugfixes -------- - +- :meth:`deduplicate` now proceeds even though the clustering of the strings fails, + with a possibility to display the warnings. + :pr:`1996` by :user:`Marie Sacksick `. 
Deprecations ------------ From 1b7d11aa2af62c4dbd158214f6701546b2590a14 Mon Sep 17 00:00:00 2001 From: Marie Date: Sat, 28 Mar 2026 18:01:15 +0100 Subject: [PATCH 3/7] pre-commit --- skrub/tests/test_deduplicate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/skrub/tests/test_deduplicate.py b/skrub/tests/test_deduplicate.py index 53f49b036..ca4bbafb9 100644 --- a/skrub/tests/test_deduplicate.py +++ b/skrub/tests/test_deduplicate.py @@ -1,8 +1,9 @@ +import warnings + import joblib import numpy as np import pandas as pd import pytest -import warnings from scipy.cluster.hierarchy import linkage from scipy.spatial.distance import squareform from sklearn.utils._testing import assert_array_equal, skip_if_no_parallel From f2a7d6a8e2e4b385e97cb9b10bd6e3387020d504 Mon Sep 17 00:00:00 2001 From: Marie Date: Sat, 25 Apr 2026 13:14:09 +0200 Subject: [PATCH 4/7] run pre commits --- CHANGES.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 7595553a9..3c641edfe 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -34,8 +34,8 @@ Changes Bugfixes -------- -- :meth:`deduplicate` now proceeds even though the clustering of the strings fails, - with a possibility to display the warnings. +- :meth:`deduplicate` now returns its input unchanged when the clustering of the strings fails, + and can optionally emit a warning about it (``warn=True``). :pr:`1996` by :user:`Marie Sacksick `. Deprecations ------------ From e62a174cc592b5dd0d430639c013fec357c9f914 Mon Sep 17 00:00:00 2001 From: Marie Date: Sat, 25 Apr 2026 19:55:42 +0200 Subject: [PATCH 5/7] cancel check --- skrub/_deduplicate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/_deduplicate.py b/skrub/_deduplicate.py index d0528568f..6dd5d91cf 100644 --- a/skrub/_deduplicate.py +++ b/skrub/_deduplicate.py @@ -77,7 +77,7 @@ def _guess_clusters(Z, distance_mat, n_jobs=None): number of clusters that maximize the silhouette score. 
""" max_clusters = Z.shape[0] - n_clusters = np.arange(1, max_clusters) + n_clusters = np.arange(2, max_clusters) # silhouette score needs a redundant distance matrix redundant_dist = squareform(distance_mat) silhouette_scores = Parallel(n_jobs=n_jobs, prefer="processes")( From 97ee3c25fd6358ad89625ca4db9c7e84865bff8e Mon Sep 17 00:00:00 2001 From: Marie Date: Sat, 25 Apr 2026 21:47:03 +0200 Subject: [PATCH 6/7] simplify claude code to match code style of the project --- skrub/_deduplicate.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/skrub/_deduplicate.py b/skrub/_deduplicate.py index 6dd5d91cf..ea1b153a1 100644 --- a/skrub/_deduplicate.py +++ b/skrub/_deduplicate.py @@ -278,9 +278,7 @@ def deduplicate( if warn: warnings.warn( "Deduplication could not cluster the data (too few or too similar" - " entries). Returning the input unchanged.", - UserWarning, - stacklevel=2, + " entries). Returning the input unchanged." ) return list(X) From a181fe8d727ce634c9a0c0108e79f9397e7bf052 Mon Sep 17 00:00:00 2001 From: Marie Sacksick <79304610+MarieSacksick@users.noreply.github.com> Date: Mon, 11 May 2026 19:19:52 +0200 Subject: [PATCH 7/7] Update skrub/_deduplicate.py Co-authored-by: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com> --- skrub/_deduplicate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/_deduplicate.py b/skrub/_deduplicate.py index c59b84817..cb41be3c4 100644 --- a/skrub/_deduplicate.py +++ b/skrub/_deduplicate.py @@ -275,7 +275,7 @@ def deduplicate( if n_clusters is None: n_clusters = _guess_clusters(Z, distance_mat, n_jobs) clusters = fcluster(Z, n_clusters, criterion="maxclust") - except Exception: + except ValueError: if warn: warnings.warn( "Deduplication could not cluster the data (too few or too similar"