Skip to content
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
5fcf629
ENH - adding doc link to html repr of estimators
rcap107 Apr 21, 2026
baf63fb
moving the new methods
rcap107 Apr 21, 2026
0fde8fd
adding more
rcap107 Apr 21, 2026
e40ecfa
fixing applytocols
rcap107 Apr 21, 2026
3156828
fixing typo
rcap107 Apr 21, 2026
8213776
adding tests
rcap107 Apr 21, 2026
4a41b3f
adding a comment
rcap107 Apr 21, 2026
b3403c0
changelog
rcap107 Apr 21, 2026
ade23b8
removing unneeded setter
rcap107 Apr 21, 2026
fe0fc13
adding more tests for coverage
rcap107 Apr 22, 2026
7f774ea
Merge branch 'main' into enh-add-doc-link-to-estimator
rcap107 Apr 23, 2026
cb48d16
Merge remote-tracking branch 'upstream/HEAD' into enh-add-doc-link-to…
rcap107 Jun 8, 2026
c2eb8b5
Merge branch 'enh-add-doc-link-to-estimator' of github.com:rcap107/sk…
rcap107 Jun 8, 2026
1f1a828
moving changes to a single file
rcap107 Jun 8, 2026
79cc5e9
_
rcap107 Jun 8, 2026
48949d2
tests
rcap107 Jun 8, 2026
f1f07d9
removing unnecessary code
rcap107 Jun 8, 2026
ea59f93
Merge remote-tracking branch 'upstream/HEAD' into enh-add-doc-link-to…
rcap107 Jun 16, 2026
a1f520f
addressing comments from review
rcap107 Jun 16, 2026
20702bd
bringing back code block and better comment
rcap107 Jun 16, 2026
ce9dc05
addressing all missing files
rcap107 Jun 16, 2026
0fe8479
fixing relative imports
rcap107 Jun 16, 2026
1752ffa
renaming to estimator, fixing order, adding to paramsearch
rcap107 Jun 16, 2026
e4458b4
Apply suggestion from @jeromedockes
rcap107 Jun 16, 2026
d601e99
Merge branch 'enh-add-doc-link-to-estimator' of github.com:rcap107/sk…
rcap107 Jun 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,9 @@ Changes
<jeromedockes>`.
- The ``exclude_cols`` of :meth:`DataOp.skb.apply` can now be a DataOp.
:pr:`2050` by :user:`Jérôme Dockès <jeromedockes>`.
- Skrub estimators now correctly show links to the documentation in the HTML
representation that is generated for notebooks. :pr:`2036` by :user:`Riccardo
Cappuzzo <rcap107>`.

Bugfixes
--------
Expand Down
18 changes: 16 additions & 2 deletions skrub/_apply_to_cols.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,19 @@
based on the type of the transformer passed to it.
"""

from sklearn.base import BaseEstimator, TransformerMixin, check_is_fitted
from sklearn.base import TransformerMixin, check_is_fitted

from . import selectors
from ._apply_to_each_col import ApplyToEachCol
from ._apply_to_sub_frame import ApplyToSubFrame
from ._base import BaseTransformer
from ._sklearn_compat import _VisualBlock
from ._wrap_transformer import wrap_transformer

_SELECT_ALL_COLUMNS = selectors.all()


class ApplyToCols(TransformerMixin, BaseEstimator):
class ApplyToCols(TransformerMixin, BaseTransformer):
"""
Apply a transformer to selected columns in a dataframe.

Expand Down Expand Up @@ -431,6 +433,18 @@ def get_feature_names_out(self, input_features=None):

return self._wrapped_transformer.get_feature_names_out(input_features)

def _sk_visual_block_(self):
# This is needed because when ApplyToCols is used with a transformer like

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure I understood this comment, but the scikit-learn diagram machinery is also quite complicated, so maybe it's not easy to explain in a short comment.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I added this block I had the problem that for some reason the (?) signs did not appear properly for the TableVectorizer, but now I can't replicate the problem anymore so I think it can be removed

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also tested sklearn 1.5, but it seems to work on the old version so not sure what happened

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Never mind, I found the problem. Without that block, the documentation link for the TableVectorizer itself is not added.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Without this function, the transformer inside ApplyToCols doesn't get the doc link:

Image

with the change, it shows up properly

Image

# TableVectorizer, the estimator is shown as a parallel block, which
# would not add the documentation link for the wrapped transformer.
# Returning a serial block that contains the transformer fixes this.
return _VisualBlock(
"serial",
[self.transformer],
names=[self.transformer.__class__.__name__],
name_details=[str(self.transformer)],
)

def __getattr__(self, name):
if name == "transformers_" and isinstance(
getattr(self, "_wrapped_transformer", None), ApplyToSubFrame
Expand Down
17 changes: 17 additions & 0 deletions skrub/_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from sklearn.base import BaseEstimator


class BaseTransformer(BaseEstimator):
Comment thread
rcap107 marked this conversation as resolved.
Outdated
_doc_link_module = "skrub"

# Defined as a property because this class inherits from
# _HTMLDocumentationLinkMixin, which also defines _doc_link_template as a
# property, and we want to be able to override it.
@property
def _doc_link_template(self):
    """URL template used to build the documentation link.

    An attribute literally named ``__doc_link_template`` takes precedence
    when it exists on the instance; it is looked up by string, so no name
    mangling applies. Otherwise the skrub stable reference URL is returned.
    The ``{estimator_module}`` and ``{estimator_name}`` placeholders are
    returned unformatted.
    """
    default_template = (
        "https://skrub-data.org/stable/reference/generated/"
        "{estimator_module}.{estimator_name}.html"
    )
    return getattr(self, "__doc_link_template", default_template)
11 changes: 11 additions & 0 deletions skrub/_data_ops/_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -787,6 +787,17 @@ def describe_params(self):
"""
return describe_params(eval_choices(self.data_op), choice_graph(self.data_op))

_doc_link_module = "skrub"
Comment thread
rcap107 marked this conversation as resolved.
Outdated

@property
def _doc_link_template(self):
    """URL template used to build the documentation link.

    Returns the value of an attribute literally named
    ``__doc_link_template`` when the instance has one (the lookup is done
    by string, so no name mangling applies), and the skrub stable reference
    URL otherwise. The ``{estimator_module}`` and ``{estimator_name}``
    placeholders are left unformatted.
    """
    fallback = (
        "https://skrub-data.org/stable/reference/generated/"
        "{estimator_module}.{estimator_name}.html"
    )
    return getattr(self, "__doc_link_template", fallback)


def _to_Xy_pipeline(learner, environment):
    """Delegate to ``learner.__skrub_to_Xy_pipeline__`` and return its result.

    ``environment`` is forwarded unchanged as the only argument.
    """
    build_pipeline = learner.__skrub_to_Xy_pipeline__
    return build_pipeline(environment)
Expand Down
7 changes: 4 additions & 3 deletions skrub/_select_cols.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from sklearn.base import BaseEstimator, TransformerMixin, check_is_fitted
from sklearn.base import TransformerMixin, check_is_fitted

from . import selectors as s
from ._base import BaseTransformer
from ._single_column_transformer import SingleColumnTransformer


class SelectCols(TransformerMixin, BaseEstimator):
class SelectCols(TransformerMixin, BaseTransformer):
"""Select a subset of a DataFrame's columns.

A ``ValueError`` is raised if any of the provided column names are not in the
Expand Down Expand Up @@ -99,7 +100,7 @@ def get_feature_names_out(self, input_features=None):
return self.columns_


class DropCols(TransformerMixin, BaseEstimator):
class DropCols(TransformerMixin, BaseTransformer):
"""Drop a subset of a DataFrame's columns.

The other columns are kept in their original order. A ``ValueError`` is raised if
Expand Down
4 changes: 2 additions & 2 deletions skrub/_single_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
import re
import textwrap

from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import check_is_fitted

from . import _dataframe as sbd
from . import _utils
from ._base import BaseTransformer

__all__ = ["SingleColumnTransformer", "RejectColumn"]

Expand Down Expand Up @@ -120,7 +120,7 @@ class RejectColumn(ValueError):
pass


class SingleColumnTransformer(BaseEstimator):
class SingleColumnTransformer(BaseTransformer):
"""Base class for single-column transformers.

Such transformers are applied independently to each column by
Expand Down
3 changes: 2 additions & 1 deletion skrub/_squashing_scaler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from sklearn.preprocessing import RobustScaler
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted

from skrub._base import BaseTransformer
from skrub._sklearn_compat import validate_data


Expand Down Expand Up @@ -82,7 +83,7 @@ def transform(self, X):
return self.scale_ * (X - self.median_)


class SquashingScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
class SquashingScaler(OneToOneFeatureMixin, TransformerMixin, BaseTransformer):
r"""Perform robust centering and scaling followed by soft clipping.

When features have large outliers, smooth clipping prevents the outliers from
Expand Down
9 changes: 6 additions & 3 deletions skrub/_table_vectorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
from collections.abc import Iterable

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.base import TransformerMixin, clone
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.validation import check_is_fitted

from . import _dataframe as sbd
from . import _utils
from . import selectors as s
from ._base import BaseTransformer
from ._check_input import CheckInputDataFrame
from ._clean_categories import CleanCategories
from ._clean_null_strings import CleanNullStrings
Expand All @@ -31,6 +32,8 @@


class PassThrough(SingleColumnTransformer):
_doc_link_module = ""

def fit_transform(self, column, y=None):
return column

Expand Down Expand Up @@ -181,7 +184,7 @@ def _get_preprocessors(
return steps


class Cleaner(TransformerMixin, BaseEstimator):
class Cleaner(TransformerMixin, BaseTransformer):
"""Column-wise consistency checks and sanitization of dtypes, null values and dates.

The ``Cleaner`` performs some consistency checks and basic preprocessing
Expand Down Expand Up @@ -539,7 +542,7 @@ def get_feature_names_out(self, input_features=None):
return np.asarray(self.all_outputs_)


class TableVectorizer(TransformerMixin, BaseEstimator):
class TableVectorizer(TransformerMixin, BaseTransformer):
"""Transform a dataframe to a numeric (vectorized) representation.

This transformer preprocesses the given dataframe by first cleaning the data
Expand Down
53 changes: 53 additions & 0 deletions skrub/tests/test_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import re

from sklearn.utils import estimator_html_repr

from skrub import (
ApplyToCols,
Cleaner,
DropCols,
SelectCols,
StringEncoder,
TableVectorizer,
)


def test_doc_link_apply_to_cols():
    """Check the skrub doc links found in the HTML repr of ApplyToCols.

    Wrapping a StringEncoder must expose the link to the ApplyToCols page,
    and wrapping a TableVectorizer must expose the link to the
    TableVectorizer page.
    """

    def doc_links(estimator):
        # Collect the absolute http(s) hrefs of the rendered diagram; hrefs
        # that contain a '#' fragment are not matched by the pattern.
        rendered = estimator_html_repr(estimator)
        return set(re.findall(r'href="(https?://[^#"]+)"', rendered))

    apply_to_cols_url = (
        "https://skrub-data.org/stable/reference/generated/skrub.ApplyToCols.html"
    )
    assert apply_to_cols_url in doc_links(ApplyToCols(StringEncoder()))

    table_vectorizer_url = (
        "https://skrub-data.org/stable/reference/generated/skrub.TableVectorizer.html"
    )
    assert table_vectorizer_url in doc_links(ApplyToCols(TableVectorizer()))


def test_doc_link_skrub_class_select_cols():
    """SelectCols and DropCols each link to their own skrub reference page."""
    select_cols_url = (
        "https://skrub-data.org/stable/reference/generated/skrub.SelectCols.html"
    )
    assert SelectCols(cols=[])._get_doc_link() == select_cols_url

    drop_cols_url = (
        "https://skrub-data.org/stable/reference/generated/skrub.DropCols.html"
    )
    assert DropCols(cols=[])._get_doc_link() == drop_cols_url


def test_doc_link_table_vectorizer():
    """TableVectorizer and Cleaner each link to their own skrub reference page."""
    table_vectorizer_url = (
        "https://skrub-data.org/stable/reference/generated/skrub.TableVectorizer.html"
    )
    assert TableVectorizer()._get_doc_link() == table_vectorizer_url

    cleaner_url = (
        "https://skrub-data.org/stable/reference/generated/skrub.Cleaner.html"
    )
    assert Cleaner()._get_doc_link() == cleaner_url
27 changes: 27 additions & 0 deletions skrub/tests/test_single_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

from skrub import GapEncoder
from skrub import _dataframe as sbd
from skrub._single_column_transformer import (
SingleColumnTransformer,
Expand Down Expand Up @@ -91,6 +92,32 @@ def fit(self, column, y=None):
assert transformer.get_feature_names_out() == [sbd.name(column)]


def test_doc_link_skrub_class():
    """GapEncoder, a public skrub class, links to its skrub reference page."""
    gap_encoder_url = (
        "https://skrub-data.org/stable/reference/generated/skrub.GapEncoder.html"
    )
    assert GapEncoder()._get_doc_link() == gap_encoder_url


def test_doc_link_user_defined_subclass():
    """A SingleColumnTransformer subclass living outside skrub.* gets no link."""

    class MyTransformer(SingleColumnTransformer):
        def fit_transform(self, column, y=None):
            return column

        def transform(self, column):
            return column

    # The class is created inside "skrub.tests.test_single_column_transformer",
    # so by default its module is under skrub.* and a doc link would be
    # generated. Overriding __module__ simulates a class defined in user code.
    MyTransformer.__module__ = "user_package"

    user_defined = MyTransformer()
    assert user_defined._get_doc_link() == ""


def test_is_single_column_transformer():
class S:
__single_column_transformer__ = True
Expand Down
Loading