skrub-data · rcap107 · Apr 21, 2026 · Apr 21, 2026 · Apr 21, 2026 · Apr 21, 2026
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -130,6 +130,9 @@ Changes
   <jeromedockes>`.
 - The ``exclude_cols`` of :meth:`DataOp.skb.apply` can now be a DataOp.
   :pr:`2050` by :user:`Jérôme Dockès <jeromedockes>`.
+- Skrub estimators now correctly show links to the documentation in the HTML
+  representation that is generated for notebooks. :pr:`2036` by :user:`Riccardo
+  Cappuzzo <rcap107>`.
 
 Bugfixes
 --------

diff --git a/skrub/_apply_to_cols.py b/skrub/_apply_to_cols.py
@@ -3,17 +3,19 @@
 based on the type of the transformer passed to it.
 """
 
-from sklearn.base import BaseEstimator, TransformerMixin, check_is_fitted
+from sklearn.base import TransformerMixin, check_is_fitted
 
 from . import selectors
 from ._apply_to_each_col import ApplyToEachCol
 from ._apply_to_sub_frame import ApplyToSubFrame
+from ._base import BaseTransformer
+from ._sklearn_compat import _VisualBlock
 from ._wrap_transformer import wrap_transformer
 
 _SELECT_ALL_COLUMNS = selectors.all()
 
 
-class ApplyToCols(TransformerMixin, BaseEstimator):
+class ApplyToCols(TransformerMixin, BaseTransformer):
     """
     Apply a transformer to selected columns in a dataframe.
 
@@ -431,6 +433,18 @@ def get_feature_names_out(self, input_features=None):
 
         return self._wrapped_transformer.get_feature_names_out(input_features)
 
+    def _sk_visual_block_(self):
+        # Without this override, wrapping a transformer such as TableVectorizer
+        # makes the HTML repr show ApplyToCols as a parallel block, which does
+        # not add the documentation link. Returning a serial block that holds
+        # only the wrapped transformer fixes that.
+        return _VisualBlock(
+            "serial",
+            [self.transformer],
+            names=[self.transformer.__class__.__name__],
+            name_details=[str(self.transformer)],
+        )
+
     def __getattr__(self, name):
         if name == "transformers_" and isinstance(
             getattr(self, "_wrapped_transformer", None), ApplyToSubFrame

diff --git a/skrub/_base.py b/skrub/_base.py
@@ -0,0 +1,37 @@
+from sklearn.base import BaseEstimator
+
+
+class BaseTransformer(BaseEstimator):
+    """Base class for skrub estimators whose HTML repr links to the skrub docs.
+
+    It overrides the ``_doc_link_module`` and ``_doc_link_template`` attributes
+    that scikit-learn reads when building an estimator's documentation link.
+    """
+
+    # Classes defined outside this top-level package get no documentation link.
+    _doc_link_module = "skrub"
+
+    # Defined as a property because _HTMLDocumentationLinkMixin, which
+    # BaseEstimator inherits from, also defines _doc_link_template as a property.
+    @property
+    def _doc_link_template(self):
+        """URL pattern of the documentation page, filled in by scikit-learn.
+
+        A template stored through the setter takes precedence over the default
+        pattern pointing to the skrub API reference.
+        """
+        return getattr(
+            self,
+            "__doc_link_template",
+            "https://skrub-data.org/stable/reference/generated/"
+            "{estimator_module}.{estimator_name}.html",
+        )
+
+    # Without a setter the property is read-only: assigning to
+    # ``_doc_link_template`` raises AttributeError and the ``getattr`` lookup in
+    # the getter can never find a stored value.
+    @_doc_link_template.setter
+    def _doc_link_template(self, value):
+        # The name is passed as a string, so it is not mangled and matches the
+        # key read by the getter.
+        setattr(self, "__doc_link_template", value)
diff --git a/skrub/_data_ops/_estimator.py b/skrub/_data_ops/_estimator.py
@@ -787,6 +787,27 @@ def describe_params(self):
         """
         return describe_params(eval_choices(self.data_op), choice_graph(self.data_op))
 
+    # Classes defined outside this top-level package get no documentation link.
+    _doc_link_module = "skrub"
+
+    @property
+    def _doc_link_template(self):
+        """URL pattern of the documentation page; defaults to the skrub reference."""
+        return getattr(
+            self,
+            "__doc_link_template",
+            "https://skrub-data.org/stable/reference/generated/"
+            "{estimator_module}.{estimator_name}.html",
+        )
+
+    # Without a setter the property is read-only: assigning to
+    # ``_doc_link_template`` raises AttributeError and the ``getattr`` lookup in
+    # the getter can never find a stored value.
+    @_doc_link_template.setter
+    def _doc_link_template(self, value):
+        # Passed as a string, so the name is not mangled and matches the getter.
+        setattr(self, "__doc_link_template", value)
+
 
 def _to_Xy_pipeline(learner, environment):
     return learner.__skrub_to_Xy_pipeline__(environment)

diff --git a/skrub/_select_cols.py b/skrub/_select_cols.py
@@ -1,10 +1,11 @@
-from sklearn.base import BaseEstimator, TransformerMixin, check_is_fitted
+from sklearn.base import TransformerMixin, check_is_fitted
 
 from . import selectors as s
+from ._base import BaseTransformer
 from ._single_column_transformer import SingleColumnTransformer
 
 
-class SelectCols(TransformerMixin, BaseEstimator):
+class SelectCols(TransformerMixin, BaseTransformer):
     """Select a subset of a DataFrame's columns.
 
     A ``ValueError`` is raised if any of the provided column names are not in the
@@ -99,7 +100,7 @@ def get_feature_names_out(self, input_features=None):
         return self.columns_
 
 
-class DropCols(TransformerMixin, BaseEstimator):
+class DropCols(TransformerMixin, BaseTransformer):
     """Drop a subset of a DataFrame's columns.
 
     The other columns are kept in their original order. A ``ValueError`` is raised if

diff --git a/skrub/_single_column_transformer.py b/skrub/_single_column_transformer.py
@@ -4,12 +4,12 @@
 import re
 import textwrap
 
-from sklearn.base import BaseEstimator
 from sklearn.pipeline import Pipeline
 from sklearn.utils.validation import check_is_fitted
 
 from . import _dataframe as sbd
 from . import _utils
+from ._base import BaseTransformer
 
 __all__ = ["SingleColumnTransformer", "RejectColumn"]
 
@@ -120,7 +120,7 @@ class RejectColumn(ValueError):
     pass
 
 
-class SingleColumnTransformer(BaseEstimator):
+class SingleColumnTransformer(BaseTransformer):
     """Base class for single-column transformers.
 
     Such transformers are applied independently to each column by

diff --git a/skrub/_squashing_scaler.py b/skrub/_squashing_scaler.py
@@ -5,6 +5,7 @@
 from sklearn.preprocessing import RobustScaler
 from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
 
+from skrub._base import BaseTransformer
 from skrub._sklearn_compat import validate_data
 
 
@@ -82,7 +83,7 @@ def transform(self, X):
         return self.scale_ * (X - self.median_)
 
 
-class SquashingScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
+class SquashingScaler(OneToOneFeatureMixin, TransformerMixin, BaseTransformer):
     r"""Perform robust centering and scaling followed by soft clipping.
 
     When features have large outliers, smooth clipping prevents the outliers from

diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py
@@ -4,14 +4,15 @@
 from collections.abc import Iterable
 
 import numpy as np
-from sklearn.base import BaseEstimator, TransformerMixin, clone
+from sklearn.base import TransformerMixin, clone
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.utils.validation import check_is_fitted
 
 from . import _dataframe as sbd
 from . import _utils
 from . import selectors as s
+from ._base import BaseTransformer
 from ._check_input import CheckInputDataFrame
 from ._clean_categories import CleanCategories
 from ._clean_null_strings import CleanNullStrings
@@ -31,6 +32,8 @@
 
 
 class PassThrough(SingleColumnTransformer):
+    _doc_link_module = ""
+
     def fit_transform(self, column, y=None):
         return column
 
@@ -181,7 +184,7 @@ def _get_preprocessors(
     return steps
 
 
-class Cleaner(TransformerMixin, BaseEstimator):
+class Cleaner(TransformerMixin, BaseTransformer):
     """Column-wise consistency checks and sanitization of dtypes, null values and dates.
 
     The ``Cleaner`` performs some consistency checks and basic preprocessing
@@ -539,7 +542,7 @@ def get_feature_names_out(self, input_features=None):
         return np.asarray(self.all_outputs_)
 
 
-class TableVectorizer(TransformerMixin, BaseEstimator):
+class TableVectorizer(TransformerMixin, BaseTransformer):
     """Transform a dataframe to a numeric (vectorized) representation.
 
     This transformer preprocesses the given dataframe by first cleaning the data

diff --git a/skrub/tests/test_base.py b/skrub/tests/test_base.py
@@ -0,0 +1,53 @@
+import re
+
+from sklearn.utils import estimator_html_repr
+
+from skrub import (
+    ApplyToCols,
+    Cleaner,
+    DropCols,
+    SelectCols,
+    StringEncoder,
+    TableVectorizer,
+)
+
+
+def test_doc_link_apply_to_cols():
+    """The HTML repr links to ApplyToCols and to a wrapped TableVectorizer."""
+    html = estimator_html_repr(ApplyToCols(StringEncoder()))
+    links = set(re.findall(r'href="(https?://[^#"]+)"', html))
+    assert (
+        "https://skrub-data.org/stable/reference/generated/skrub.ApplyToCols.html"
+        in links
+    )
+
+    html = estimator_html_repr(ApplyToCols(TableVectorizer()))
+    links = set(re.findall(r'href="(https?://[^#"]+)"', html))
+    assert (
+        "https://skrub-data.org/stable/reference/generated/skrub.TableVectorizer.html"
+        in links
+    )
+
+
+def test_doc_link_skrub_class_select_cols():
+    """SelectCols and DropCols link to their skrub reference pages."""
+    link = SelectCols(cols=[])._get_doc_link()
+    assert link == (
+        "https://skrub-data.org/stable/reference/generated/skrub.SelectCols.html"
+    )
+    link = DropCols(cols=[])._get_doc_link()
+    assert link == (
+        "https://skrub-data.org/stable/reference/generated/skrub.DropCols.html"
+    )
+
+
+def test_doc_link_table_vectorizer():
+    """TableVectorizer and Cleaner link to their skrub reference pages."""
+    link = TableVectorizer()._get_doc_link()
+    assert link == (
+        "https://skrub-data.org/stable/reference/generated/skrub.TableVectorizer.html"
+    )
+    link = Cleaner()._get_doc_link()
+    assert link == (
+        "https://skrub-data.org/stable/reference/generated/skrub.Cleaner.html"
+    )
diff --git a/skrub/tests/test_single_column_transformer.py b/skrub/tests/test_single_column_transformer.py
@@ -3,6 +3,7 @@
 from sklearn.pipeline import Pipeline, make_pipeline
 from sklearn.preprocessing import StandardScaler
 
+from skrub import GapEncoder
 from skrub import _dataframe as sbd
 from skrub._single_column_transformer import (
     SingleColumnTransformer,
@@ -91,6 +92,32 @@ def fit(self, column, y=None):
     assert transformer.get_feature_names_out() == [sbd.name(column)]
 
 
+def test_doc_link_skrub_class():
+    """GapEncoder's doc link points to its skrub reference page."""
+    link = GapEncoder()._get_doc_link()
+    assert link == (
+        "https://skrub-data.org/stable/reference/generated/skrub.GapEncoder.html"
+    )
+
+
+def test_doc_link_user_defined_subclass():
+    """User-defined subclasses outside skrub.* produce no link."""
+
+    class MyTransformer(SingleColumnTransformer):
+        def fit_transform(self, column, y=None):
+            return column
+
+        def transform(self, column):
+            return column
+
+    # Needed to simulate a user-defined class outside of skrub.*.
+    # This test runs in the module "skrub.tests.test_single_column_transformer",
+    # which is therefore the default module of MyTransformer and would cause a
+    # doc link to be generated.
+    MyTransformer.__module__ = "user_package"
+    assert MyTransformer()._get_doc_link() == ""
+
+
 def test_is_single_column_transformer():
     class S:
         __single_column_transformer__ = True