From 9c93e036d5a052c2eeb994fd9c427fba44699068 Mon Sep 17 00:00:00 2001
From: Marie <marie.sacksick@posteo.net>
Date: Sat, 28 Mar 2026 15:53:50 +0100
Subject: [PATCH 1/7] fix: return the input unchanged (with an optional warning) when deduplicate cannot cluster

---
 skrub/_deduplicate.py           | 33 +++++++++++++++++++++++---------
 skrub/tests/test_deduplicate.py | 34 +++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 9 deletions(-)

diff --git a/skrub/_deduplicate.py b/skrub/_deduplicate.py
index 7e62d8097..d0528568f 100644
--- a/skrub/_deduplicate.py
+++ b/skrub/_deduplicate.py
@@ -2,6 +2,8 @@
 Implements deduplication based on clustering string distance matrices.
 """
 
+import warnings
+
 import numpy as np
 import pandas as pd
 from joblib import Parallel, delayed
@@ -75,7 +77,7 @@ def _guess_clusters(Z, distance_mat, n_jobs=None):
         number of clusters that maximize the silhouette score.
     """
     max_clusters = Z.shape[0]
-    n_clusters = np.arange(2, max_clusters)
+    n_clusters = np.arange(1, max_clusters)
     # silhouette score needs a redundant distance matrix
     redundant_dist = squareform(distance_mat)
     silhouette_scores = Parallel(n_jobs=n_jobs, prefer="processes")(
@@ -136,6 +138,7 @@ def deduplicate(
     analyzer="char_wb",
     linkage_method="average",
     n_jobs=None,
+    warn=False,
 ):
     """Deduplicate categorical data by hierarchically clustering similar strings.
 
@@ -168,6 +171,9 @@ def deduplicate(
         average distance between data points in the first and second cluster.
     n_jobs : int, default=None
         The number of jobs to run in parallel.
+    warn : bool, default=False
+        Whether to emit a warning when clustering fails (e.g. too few or too
+        similar entries). On failure the input is returned unchanged regardless.
 
     Returns
     -------
@@ -260,14 +266,23 @@ def deduplicate(
     9  white      9              white
     """
     unique_words, counts = np.unique(X, return_counts=True)
-    distance_mat = _compute_ngram_distance(
-        unique_words, ngram_range=ngram_range, analyzer=analyzer
-    )
-
-    Z = linkage(distance_mat, method=linkage_method, optimal_ordering=True)
-    if n_clusters is None:
-        n_clusters = _guess_clusters(Z, distance_mat, n_jobs)
-    clusters = fcluster(Z, n_clusters, criterion="maxclust")
+    try:
+        distance_mat = _compute_ngram_distance(
+            unique_words, ngram_range=ngram_range, analyzer=analyzer
+        )
+        Z = linkage(distance_mat, method=linkage_method, optimal_ordering=True)
+        if n_clusters is None:
+            n_clusters = _guess_clusters(Z, distance_mat, n_jobs)
+        clusters = fcluster(Z, n_clusters, criterion="maxclust")
+    except Exception:
+        if warn:
+            warnings.warn(
+                "Deduplication could not cluster the data (too few or too similar"
+                " entries). Returning the input unchanged.",
+                UserWarning,
+                stacklevel=2,
+            )
+        return list(X)
 
     translation_table = _create_spelling_correction(unique_words, counts, clusters)
     unrolled_corrections = translation_table[X]
diff --git a/skrub/tests/test_deduplicate.py b/skrub/tests/test_deduplicate.py
index 72309c4d4..53f49b036 100644
--- a/skrub/tests/test_deduplicate.py
+++ b/skrub/tests/test_deduplicate.py
@@ -2,6 +2,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+import warnings
 from scipy.cluster.hierarchy import linkage
 from scipy.spatial.distance import squareform
 from sklearn.utils._testing import assert_array_equal, skip_if_no_parallel
@@ -145,6 +146,39 @@ def start_call(self):
 joblib.register_parallel_backend("testing", DummyBackend)
 
 
+@pytest.mark.parametrize(
+    "X",
+    [
+        # Too few unique entries for silhouette score (only 2 unique values)
+        ["black", "black", "black", "blac"],
+        # Too few unique entries (3 unique values, only 1 cluster possible)
+        ["black", "white", "black", "black", "blac"],
+        # 4 unique values but still not enough for clustering to succeed
+        ["black", "black", "black", "black", "white", "white", "white", "red", "green"],
+    ],
+)
+def test_deduplicate_failure_returns_input(X):
+    result = deduplicate(X)
+    assert isinstance(result, list)
+    assert result == X
+
+
+@pytest.mark.parametrize(
+    "X",
+    [
+        ["black", "black", "black", "blac"],
+        ["black", "white", "black", "black", "blac"],
+    ],
+)
+def test_deduplicate_warn(X):
+    with pytest.warns(UserWarning, match="Returning the input unchanged"):
+        deduplicate(X, warn=True)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
+        deduplicate(X, warn=False)  # raises if any warning is emitted
+
+
 @skip_if_no_parallel
 def test_backend_respected():
     """

From 01756e7b47bf01deffea1edada78c7d2cd820fe8 Mon Sep 17 00:00:00 2001
From: Marie <marie.sacksick@posteo.net>
Date: Sat, 28 Mar 2026 18:01:02 +0100
Subject: [PATCH 2/7] add changelog

---
 CHANGES.rst | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index 6a272b69c..7595553a9 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -31,9 +31,12 @@ Changes
   supported by :class:`~sklearn.neighbors.NearestNeighbors` (see its docstring).
   :pr:`1861` by :user:`Saba Siddique <sabasiddique1>`.
 
+
 Bugfixes
 --------
-
+- :meth:`deduplicate` now proceeds even though the clustering of the strings fails, 
+  with a possibility to display the warnings. 
+  :pr:`1996` by :user:`Marie Sacksick <MarieSacksick>`.
 
 Deprecations
 ------------

From 1b7d11aa2af62c4dbd158214f6701546b2590a14 Mon Sep 17 00:00:00 2001
From: Marie <marie.sacksick@posteo.net>
Date: Sat, 28 Mar 2026 18:01:15 +0100
Subject: [PATCH 3/7] pre-commit

---
 skrub/tests/test_deduplicate.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/skrub/tests/test_deduplicate.py b/skrub/tests/test_deduplicate.py
index 53f49b036..ca4bbafb9 100644
--- a/skrub/tests/test_deduplicate.py
+++ b/skrub/tests/test_deduplicate.py
@@ -1,8 +1,9 @@
+import warnings
+
 import joblib
 import numpy as np
 import pandas as pd
 import pytest
-import warnings
 from scipy.cluster.hierarchy import linkage
 from scipy.spatial.distance import squareform
 from sklearn.utils._testing import assert_array_equal, skip_if_no_parallel

From f2a7d6a8e2e4b385e97cb9b10bd6e3387020d504 Mon Sep 17 00:00:00 2001
From: Marie <marie.sacksick@posteo.net>
Date: Sat, 25 Apr 2026 13:14:09 +0200
Subject: [PATCH 4/7] run pre commits

---
 CHANGES.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index 7595553a9..3c641edfe 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -34,8 +34,8 @@ Changes
 
 Bugfixes
 --------
-- :meth:`deduplicate` now proceeds even though the clustering of the strings fails, 
-  with a possibility to display the warnings. 
+- :func:`deduplicate` now returns its input unchanged instead of raising when the
+  strings cannot be clustered; pass ``warn=True`` to be warned when this happens.
   :pr:`1996` by :user:`Marie Sacksick <MarieSacksick>`.
 
 Deprecations

From e62a174cc592b5dd0d430639c013fec357c9f914 Mon Sep 17 00:00:00 2001
From: Marie <marie.sacksick@posteo.net>
Date: Sat, 25 Apr 2026 19:55:42 +0200
Subject: [PATCH 5/7] cancel check

---
 skrub/_deduplicate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/skrub/_deduplicate.py b/skrub/_deduplicate.py
index d0528568f..6dd5d91cf 100644
--- a/skrub/_deduplicate.py
+++ b/skrub/_deduplicate.py
@@ -77,7 +77,7 @@ def _guess_clusters(Z, distance_mat, n_jobs=None):
         number of clusters that maximize the silhouette score.
     """
     max_clusters = Z.shape[0]
-    n_clusters = np.arange(1, max_clusters)
+    n_clusters = np.arange(2, max_clusters)
     # silhouette score needs a redundant distance matrix
     redundant_dist = squareform(distance_mat)
     silhouette_scores = Parallel(n_jobs=n_jobs, prefer="processes")(

From 97ee3c25fd6358ad89625ca4db9c7e84865bff8e Mon Sep 17 00:00:00 2001
From: Marie <marie.sacksick@posteo.net>
Date: Sat, 25 Apr 2026 21:47:03 +0200
Subject: [PATCH 6/7] simplify claude code to match code style of the project

---
 skrub/_deduplicate.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/skrub/_deduplicate.py b/skrub/_deduplicate.py
index 6dd5d91cf..ea1b153a1 100644
--- a/skrub/_deduplicate.py
+++ b/skrub/_deduplicate.py
@@ -278,9 +278,7 @@ def deduplicate(
         if warn:
             warnings.warn(
                 "Deduplication could not cluster the data (too few or too similar"
-                " entries). Returning the input unchanged.",
-                UserWarning,
-                stacklevel=2,
+                " entries). Returning the input unchanged."
             )
         return list(X)
 

From a181fe8d727ce634c9a0c0108e79f9397e7bf052 Mon Sep 17 00:00:00 2001
From: Marie Sacksick <79304610+MarieSacksick@users.noreply.github.com>
Date: Mon, 11 May 2026 19:19:52 +0200
Subject: [PATCH 7/7] Update skrub/_deduplicate.py

Co-authored-by: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com>
---
 skrub/_deduplicate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/skrub/_deduplicate.py b/skrub/_deduplicate.py
index c59b84817..cb41be3c4 100644
--- a/skrub/_deduplicate.py
+++ b/skrub/_deduplicate.py
@@ -275,7 +275,7 @@ def deduplicate(
         if n_clusters is None:
             n_clusters = _guess_clusters(Z, distance_mat, n_jobs)
         clusters = fcluster(Z, n_clusters, criterion="maxclust")
-    except Exception:
+    except ValueError:
         if warn:
             warnings.warn(
                 "Deduplication could not cluster the data (too few or too similar"