From 5fcf6297fbda310440b807be41d91553a04460b4 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 21 Apr 2026 13:04:18 +0200 Subject: [PATCH 01/20] ENH - adding doc link to html repr of estimators --- skrub/_single_column_transformer.py | 17 +++++++++++++ skrub/_table_vectorizer.py | 38 +++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/skrub/_single_column_transformer.py b/skrub/_single_column_transformer.py index 9727f7d14..93890909b 100644 --- a/skrub/_single_column_transformer.py +++ b/skrub/_single_column_transformer.py @@ -143,6 +143,23 @@ class SingleColumnTransformer(BaseEstimator): """ __single_column_transformer__ = True + _doc_link_module = "skrub" + + # Defining this as a property because it inherits from _HTMLDocumentationLinkMixin, + # which also defines _doc_link_template as a property, and we want to be able + # to override it. + @property + def _doc_link_template(self): + return getattr( + self, + "__doc_link_template", + "https://skrub-data.org/stable/reference/generated/" + "{estimator_module}.{estimator_name}.html", + ) + + @_doc_link_template.setter + def _doc_link_template(self, value): + setattr(self, "__doc_link_template", value) def set_output(self, *, transform=None): """ diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 7b2f12993..cc4e9fe10 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -29,6 +29,8 @@ class PassThrough(SingleColumnTransformer): + _doc_link_module = "" + def fit_transform(self, column, y=None): return column @@ -338,6 +340,24 @@ class Cleaner(TransformerMixin, BaseEstimator): [DropUninformative()] """ + _doc_link_module = "skrub" + + # Defining this as a property because it inherits from _HTMLDocumentationLinkMixin, + # which also defines _doc_link_template as a property, and we want to be able + # to override it. 
+ @property + def _doc_link_template(self): + return getattr( + self, + "__doc_link_template", + "https://skrub-data.org/stable/reference/generated/" + "{estimator_module}.{estimator_name}.html", + ) + + @_doc_link_template.setter + def _doc_link_template(self, value): + setattr(self, "__doc_link_template", value) + def __init__( self, drop_null_fraction=1.0, @@ -796,6 +816,24 @@ class TableVectorizer(TransformerMixin, BaseEstimator): ValueError: Column 'A' used twice in 'specific_transformers', at indices 0 and 1. """ # noqa: E501 + _doc_link_module = "skrub" + + # Defining this as a property because it inherits from _HTMLDocumentationLinkMixin, + # which also defines _doc_link_template as a property, and we want to be able + # to override it. + @property + def _doc_link_template(self): + return getattr( + self, + "__doc_link_template", + "https://skrub-data.org/stable/reference/generated/" + "{estimator_module}.{estimator_name}.html", + ) + + @_doc_link_template.setter + def _doc_link_template(self, value): + setattr(self, "__doc_link_template", value) + def __init__( self, *, From baf63fbd76588811fe996c15e311ac3cfc0d01cd Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 21 Apr 2026 13:14:30 +0200 Subject: [PATCH 02/20] moving the new methods --- skrub/_apply_to_cols.py | 18 ++++++++++++++ skrub/_select_cols.py | 36 ++++++++++++++++++++++++++++ skrub/_table_vectorizer.py | 48 +++++++++++++++++++++++++------------- 3 files changed, 86 insertions(+), 16 deletions(-) diff --git a/skrub/_apply_to_cols.py b/skrub/_apply_to_cols.py index 523e6a341..195c0564c 100644 --- a/skrub/_apply_to_cols.py +++ b/skrub/_apply_to_cols.py @@ -276,6 +276,24 @@ class ApplyToCols(TransformerMixin, BaseEstimator): 1 10.0 100.0 1.0 1.0 """ + _doc_link_module = "skrub" + + # Defining this as a property because it inherits from _HTMLDocumentationLinkMixin, + # which also defines _doc_link_template as a property, and we want to be able + # to override it. 
+ @property + def _doc_link_template(self): + return getattr( + self, + "__doc_link_template", + "https://skrub-data.org/stable/reference/generated/" + "{estimator_module}.{estimator_name}.html", + ) + + @_doc_link_template.setter + def _doc_link_template(self, value): + setattr(self, "__doc_link_template", value) + def __init__( self, transformer, diff --git a/skrub/_select_cols.py b/skrub/_select_cols.py index 34cc97387..da001f455 100644 --- a/skrub/_select_cols.py +++ b/skrub/_select_cols.py @@ -43,6 +43,8 @@ class SelectCols(TransformerMixin, BaseEstimator): ValueError: The following columns are requested for selection but missing from dataframe: ['X'] """ # noqa: E501 + _doc_link_module = "skrub" + def __init__(self, cols): self.cols = cols @@ -98,6 +100,22 @@ def get_feature_names_out(self, input_features=None): check_is_fitted(self, "columns_") return self.columns_ + # Defining this as a property because it inherits from _HTMLDocumentationLinkMixin, + # which also defines _doc_link_template as a property, and we want to be able + # to override it. + @property + def _doc_link_template(self): + return getattr( + self, + "__doc_link_template", + "https://skrub-data.org/stable/reference/generated/" + "{estimator_module}.{estimator_name}.html", + ) + + @_doc_link_template.setter + def _doc_link_template(self, value): + setattr(self, "__doc_link_template", value) + class DropCols(TransformerMixin, BaseEstimator): """Drop a subset of a DataFrame's columns. 
@@ -138,6 +156,8 @@ class DropCols(TransformerMixin, BaseEstimator): ValueError: The following columns are requested for selection but missing from dataframe: ['X'] """ # noqa: E501 + _doc_link_module = "skrub" + def __init__(self, cols): self.cols = cols @@ -195,6 +215,22 @@ def get_feature_names_out(self, input_features=None): check_is_fitted(self, "kept_cols_") return self.kept_cols_ + # Defining this as a property because it inherits from _HTMLDocumentationLinkMixin, + # which also defines _doc_link_template as a property, and we want to be able + # to override it. + @property + def _doc_link_template(self): + return getattr( + self, + "__doc_link_template", + "https://skrub-data.org/stable/reference/generated/" + "{estimator_module}.{estimator_name}.html", + ) + + @_doc_link_template.setter + def _doc_link_template(self, value): + setattr(self, "__doc_link_template", value) + class Drop(SingleColumnTransformer): def fit_transform(self, column, y=None): diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index cc4e9fe10..697bab3f7 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -342,22 +342,6 @@ class Cleaner(TransformerMixin, BaseEstimator): _doc_link_module = "skrub" - # Defining this as a property because it inherits from _HTMLDocumentationLinkMixin, - # which also defines _doc_link_template as a property, and we want to be able - # to override it. 
- @property - def _doc_link_template(self): - return getattr( - self, - "__doc_link_template", - "https://skrub-data.org/stable/reference/generated/" - "{estimator_module}.{estimator_name}.html", - ) - - @_doc_link_template.setter - def _doc_link_template(self, value): - setattr(self, "__doc_link_template", value) - def __init__( self, drop_null_fraction=1.0, @@ -477,6 +461,22 @@ def get_feature_names_out(self, input_features=None): check_is_fitted(self, "all_outputs_") return np.asarray(self.all_outputs_) + # Defining this as a property because it inherits from _HTMLDocumentationLinkMixin, + # which also defines _doc_link_template as a property, and we want to be able + # to override it. + @property + def _doc_link_template(self): + return getattr( + self, + "__doc_link_template", + "https://skrub-data.org/stable/reference/generated/" + "{estimator_module}.{estimator_name}.html", + ) + + @_doc_link_template.setter + def _doc_link_template(self, value): + setattr(self, "__doc_link_template", value) + class TableVectorizer(TransformerMixin, BaseEstimator): """Transform a dataframe to a numeric (vectorized) representation. @@ -1114,3 +1114,19 @@ def get_feature_names_out(self, input_features=None): """ check_is_fitted(self, "all_outputs_") return np.asarray(self.all_outputs_) + + # Defining this as a property because it inherits from _HTMLDocumentationLinkMixin, + # which also defines _doc_link_template as a property, and we want to be able + # to override it. 
+ @property + def _doc_link_template(self): + return getattr( + self, + "__doc_link_template", + "https://skrub-data.org/stable/reference/generated/" + "{estimator_module}.{estimator_name}.html", + ) + + @_doc_link_template.setter + def _doc_link_template(self, value): + setattr(self, "__doc_link_template", value) From 0fde8fd401cedb3c9ad5369e451e806f3d262ca3 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 21 Apr 2026 13:23:58 +0200 Subject: [PATCH 03/20] adding more --- skrub/_data_ops/_estimator.py | 15 +++++++++++++++ skrub/_squashing_scaler.py | 18 ++++++++++++++++++ skrub/_table_vectorizer.py | 16 ---------------- 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/skrub/_data_ops/_estimator.py b/skrub/_data_ops/_estimator.py index 43988b96b..b2e8d4339 100644 --- a/skrub/_data_ops/_estimator.py +++ b/skrub/_data_ops/_estimator.py @@ -560,6 +560,21 @@ def describe_params(self): """ return describe_params(eval_choices(self.data_op), choice_graph(self.data_op)) + _doc_link_module = "skrub" + + @property + def _doc_link_template(self): + return getattr( + self, + "__doc_link_template", + "https://skrub-data.org/stable/reference/generated/" + "{estimator_module}.{estimator_name}.html", + ) + + @_doc_link_template.setter + def _doc_link_template(self, value): + setattr(self, "__doc_link_template", value) + def _to_Xy_pipeline(learner, environment): return learner.__skrub_to_Xy_pipeline__(environment) diff --git a/skrub/_squashing_scaler.py b/skrub/_squashing_scaler.py index a2acf24b7..ffc2fa104 100644 --- a/skrub/_squashing_scaler.py +++ b/skrub/_squashing_scaler.py @@ -335,3 +335,21 @@ def transform(self, X): X_tr = _set_zeros(X_tr, self.zero_cols_) return _soft_clip(X_tr, self.max_absolute_value, mask_inf) + + _doc_link_module = "skrub" + + # Defining this as a property because it inherits from _HTMLDocumentationLinkMixin, + # which also defines _doc_link_template as a property, and we want to be able + # to override it. 
+ @property + def _doc_link_template(self): + return getattr( + self, + "__doc_link_template", + "https://skrub-data.org/stable/reference/generated/" + "{estimator_module}.{estimator_name}.html", + ) + + @_doc_link_template.setter + def _doc_link_template(self, value): + setattr(self, "__doc_link_template", value) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 697bab3f7..20cc88249 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -818,22 +818,6 @@ class TableVectorizer(TransformerMixin, BaseEstimator): _doc_link_module = "skrub" - # Defining this as a property because it inherits from _HTMLDocumentationLinkMixin, - # which also defines _doc_link_template as a property, and we want to be able - # to override it. - @property - def _doc_link_template(self): - return getattr( - self, - "__doc_link_template", - "https://skrub-data.org/stable/reference/generated/" - "{estimator_module}.{estimator_name}.html", - ) - - @_doc_link_template.setter - def _doc_link_template(self, value): - setattr(self, "__doc_link_template", value) - def __init__( self, *, From e40ecfaf1c6b9467e4cdd23870abab3f352af171 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 21 Apr 2026 14:31:18 +0200 Subject: [PATCH 04/20] fixing applytocols --- skrub/_apply_to_cols.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/skrub/_apply_to_cols.py b/skrub/_apply_to_cols.py index 195c0564c..55047586c 100644 --- a/skrub/_apply_to_cols.py +++ b/skrub/_apply_to_cols.py @@ -8,6 +8,7 @@ from . 
import selectors from ._apply_to_each_col import ApplyToEachCol from ._apply_to_sub_frame import ApplyToSubFrame +from ._sklearn_compat import _VisualBlock from ._wrap_transformer import wrap_transformer _SELECT_ALL_COLUMNS = selectors.all() @@ -431,6 +432,18 @@ def get_feature_names_out(self, input_features=None): return self._wrapped_transformer.get_feature_names_out(input_features) + def _sk_visual_block_(self): + # This is needed because when ApplyToCols is used with a transformer like + # TableVectorizser then the estimator is shown as a parallel block, which + # would not add the documentation link. + # With this override the problem is fixed. + return _VisualBlock( + "serial", + [self.transformer], + names=[self.transformer.__class__.__name__], + name_details=[str(self.transformer)], + ) + def __getattr__(self, name): if name == "transformers_" and isinstance( getattr(self, "_wrapped_transformer", None), ApplyToSubFrame From 3156828b9568d149ae8640853a6201266b005d7e Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 21 Apr 2026 16:10:18 +0200 Subject: [PATCH 05/20] fixing typo --- skrub/_apply_to_cols.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/_apply_to_cols.py b/skrub/_apply_to_cols.py index 55047586c..d87f3f0ad 100644 --- a/skrub/_apply_to_cols.py +++ b/skrub/_apply_to_cols.py @@ -434,7 +434,7 @@ def get_feature_names_out(self, input_features=None): def _sk_visual_block_(self): # This is needed because when ApplyToCols is used with a transformer like - # TableVectorizser then the estimator is shown as a parallel block, which + # TableVectorizer then the estimator is shown as a parallel block, which # would not add the documentation link. # With this override the problem is fixed. 
return _VisualBlock( From 8213776322fca2976426ec67e4ff031156687583 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 21 Apr 2026 17:02:34 +0200 Subject: [PATCH 06/20] adding tests --- skrub/tests/test_apply_to_cols.py | 21 ++++++++++++++++- skrub/tests/test_single_column_transformer.py | 23 +++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/skrub/tests/test_apply_to_cols.py b/skrub/tests/test_apply_to_cols.py index ab0fd3b62..31336d136 100644 --- a/skrub/tests/test_apply_to_cols.py +++ b/skrub/tests/test_apply_to_cols.py @@ -1,11 +1,13 @@ import datetime +import re import numpy as np import pytest from sklearn.exceptions import NotFittedError from sklearn.preprocessing import OrdinalEncoder +from sklearn.utils import estimator_html_repr -from skrub import ApplyToCols +from skrub import ApplyToCols, StringEncoder, TableVectorizer from skrub import _dataframe as sbd from skrub import selectors as s from skrub._to_datetime import ToDatetime @@ -162,6 +164,23 @@ def test_get_feature_names_out_after_fit(df_module): assert feature_names == ["date_col"] +def test_doc_link_wrapped_transformer_in_html_repr(): + """The wrapped transformer's doc link appears in the HTML repr of ApplyToCols.""" + html = estimator_html_repr(ApplyToCols(StringEncoder())) + links = set(re.findall(r'href="(https?://[^#"]+)"', html)) + assert ( + "https://skrub-data.org/stable/reference/generated/skrub.StringEncoder.html" + in links + ) + + html = estimator_html_repr(ApplyToCols(TableVectorizer())) + links = set(re.findall(r'href="(https?://[^#"]+)"', html)) + assert ( + "https://skrub-data.org/stable/reference/generated/skrub.TableVectorizer.html" + in links + ) + + def test_getattr_raises_for_wrong_attribute(df_module): """Test __getattr__ raises proper AttributeError for wrong attributes.""" # Test that accessing transformers_ on non-single-column transformer raises error diff --git a/skrub/tests/test_single_column_transformer.py 
b/skrub/tests/test_single_column_transformer.py index 12914df3e..6e49bd6fe 100644 --- a/skrub/tests/test_single_column_transformer.py +++ b/skrub/tests/test_single_column_transformer.py @@ -3,6 +3,7 @@ from sklearn.pipeline import Pipeline, make_pipeline from sklearn.preprocessing import StandardScaler +from skrub import GapEncoder from skrub import _dataframe as sbd from skrub._single_column_transformer import ( SingleColumnTransformer, @@ -91,6 +92,28 @@ def fit(self, column, y=None): assert transformer.get_feature_names_out() == [sbd.name(column)] +def test_doc_link_skrub_class(): + """Public skrub classes get a link to skrub documentation.""" + link = GapEncoder()._get_doc_link() + assert link == ( + "https://skrub-data.org/stable/reference/generated/skrub.GapEncoder.html" + ) + + +def test_doc_link_user_defined_subclass(): + """User-defined subclasses outside skrub.* produce no link.""" + + class MyTransformer(SingleColumnTransformer): + def fit_transform(self, column, y=None): + return column + + def transform(self, column): + return column + + MyTransformer.__module__ = "user_package" + assert MyTransformer()._get_doc_link() == "" + + def test_is_single_column_transformer(): class S: __single_column_transformer__ = True From 4a41b3ff733a43857f37c48024cd9580e131e2d6 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 21 Apr 2026 17:07:30 +0200 Subject: [PATCH 07/20] adding a comment --- skrub/tests/test_single_column_transformer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/skrub/tests/test_single_column_transformer.py b/skrub/tests/test_single_column_transformer.py index 6e49bd6fe..003b2b3f0 100644 --- a/skrub/tests/test_single_column_transformer.py +++ b/skrub/tests/test_single_column_transformer.py @@ -110,6 +110,10 @@ def fit_transform(self, column, y=None): def transform(self, column): return column + # Needed to simulate a user-defined class outside of skrub.*. 
+ # Since this test is running in a module named + # "skrub.tests.test_single_column_transformer", that is the default module + # for MyTransformer, which would cause a doc link to be generated. MyTransformer.__module__ = "user_package" assert MyTransformer()._get_doc_link() == "" From b3403c060e3f63f2a7ac091f123a33870f043921 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 21 Apr 2026 17:12:05 +0200 Subject: [PATCH 08/20] changelog --- CHANGES.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index b5009cd15..339cbb569 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -26,6 +26,9 @@ Changes - The row indices of training and testing samples are now also included in the dictionaries produced by :meth:`DataOp.skb.iter_cv_splits`. :pr:`2012` by :user:`Jérôme Dockès `. +- Skrub estimators now correctly show links to the documentation in the HTML + representation that is generated for notebooks. :pr:`2036` by :user:`Riccardo + Cappuzzo `. Bugfixes -------- From ade23b8a12ebcfc7d15db669ef0d905814a6c0d5 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 21 Apr 2026 18:00:49 +0200 Subject: [PATCH 09/20] removing unneeded setter --- skrub/_apply_to_cols.py | 4 ---- skrub/_data_ops/_estimator.py | 4 ---- skrub/_select_cols.py | 8 -------- skrub/_single_column_transformer.py | 4 ---- skrub/_squashing_scaler.py | 4 ---- skrub/_table_vectorizer.py | 8 -------- 6 files changed, 32 deletions(-) diff --git a/skrub/_apply_to_cols.py b/skrub/_apply_to_cols.py index d87f3f0ad..f4dd6d7bf 100644 --- a/skrub/_apply_to_cols.py +++ b/skrub/_apply_to_cols.py @@ -291,10 +291,6 @@ def _doc_link_template(self): "{estimator_module}.{estimator_name}.html", ) - @_doc_link_template.setter - def _doc_link_template(self, value): - setattr(self, "__doc_link_template", value) - def __init__( self, transformer, diff --git a/skrub/_data_ops/_estimator.py b/skrub/_data_ops/_estimator.py index b2e8d4339..9a81c7a19 100644 --- 
a/skrub/_data_ops/_estimator.py +++ b/skrub/_data_ops/_estimator.py @@ -571,10 +571,6 @@ def _doc_link_template(self): "{estimator_module}.{estimator_name}.html", ) - @_doc_link_template.setter - def _doc_link_template(self, value): - setattr(self, "__doc_link_template", value) - def _to_Xy_pipeline(learner, environment): return learner.__skrub_to_Xy_pipeline__(environment) diff --git a/skrub/_select_cols.py b/skrub/_select_cols.py index da001f455..d92252c56 100644 --- a/skrub/_select_cols.py +++ b/skrub/_select_cols.py @@ -112,10 +112,6 @@ def _doc_link_template(self): "{estimator_module}.{estimator_name}.html", ) - @_doc_link_template.setter - def _doc_link_template(self, value): - setattr(self, "__doc_link_template", value) - class DropCols(TransformerMixin, BaseEstimator): """Drop a subset of a DataFrame's columns. @@ -227,10 +223,6 @@ def _doc_link_template(self): "{estimator_module}.{estimator_name}.html", ) - @_doc_link_template.setter - def _doc_link_template(self, value): - setattr(self, "__doc_link_template", value) - class Drop(SingleColumnTransformer): def fit_transform(self, column, y=None): diff --git a/skrub/_single_column_transformer.py b/skrub/_single_column_transformer.py index 93890909b..d9883ec7d 100644 --- a/skrub/_single_column_transformer.py +++ b/skrub/_single_column_transformer.py @@ -157,10 +157,6 @@ def _doc_link_template(self): "{estimator_module}.{estimator_name}.html", ) - @_doc_link_template.setter - def _doc_link_template(self, value): - setattr(self, "__doc_link_template", value) - def set_output(self, *, transform=None): """ Default no-op implementation for set_output. 
diff --git a/skrub/_squashing_scaler.py b/skrub/_squashing_scaler.py index ffc2fa104..fc44aa004 100644 --- a/skrub/_squashing_scaler.py +++ b/skrub/_squashing_scaler.py @@ -349,7 +349,3 @@ def _doc_link_template(self): "https://skrub-data.org/stable/reference/generated/" "{estimator_module}.{estimator_name}.html", ) - - @_doc_link_template.setter - def _doc_link_template(self, value): - setattr(self, "__doc_link_template", value) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 20cc88249..243b0c661 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -473,10 +473,6 @@ def _doc_link_template(self): "{estimator_module}.{estimator_name}.html", ) - @_doc_link_template.setter - def _doc_link_template(self, value): - setattr(self, "__doc_link_template", value) - class TableVectorizer(TransformerMixin, BaseEstimator): """Transform a dataframe to a numeric (vectorized) representation. @@ -1110,7 +1106,3 @@ def _doc_link_template(self): "https://skrub-data.org/stable/reference/generated/" "{estimator_module}.{estimator_name}.html", ) - - @_doc_link_template.setter - def _doc_link_template(self, value): - setattr(self, "__doc_link_template", value) From fe0fc1355a1da905c0e3631d62cf8c4386d94152 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Wed, 22 Apr 2026 10:00:05 +0200 Subject: [PATCH 10/20] adding more tests for coverage --- skrub/tests/test_select_cols.py | 12 ++++++++++++ skrub/tests/test_squashing_scaler.py | 8 ++++++++ skrub/tests/test_table_vectorizer.py | 12 ++++++++++++ 3 files changed, 32 insertions(+) diff --git a/skrub/tests/test_select_cols.py b/skrub/tests/test_select_cols.py index 739c86eb7..26cba9d3a 100644 --- a/skrub/tests/test_select_cols.py +++ b/skrub/tests/test_select_cols.py @@ -88,3 +88,15 @@ def test_get_feature_names_out(df): pipeline = make_pipeline(DropCols(cols=["A", "B"]), DummyClassifier()) pipeline.fit(df, df["C"]) assert pipeline[:-1].get_feature_names_out() == ["C"] + + +def 
test_doc_link_skrub_class(): + """Public skrub classes get a link to skrub documentation.""" + link = SelectCols(cols=[])._get_doc_link() + assert link == ( + "https://skrub-data.org/stable/reference/generated/skrub.SelectCols.html" + ) + link = DropCols(cols=[])._get_doc_link() + assert link == ( + "https://skrub-data.org/stable/reference/generated/skrub.DropCols.html" + ) diff --git a/skrub/tests/test_squashing_scaler.py b/skrub/tests/test_squashing_scaler.py index f40ba1731..e74e909e3 100644 --- a/skrub/tests/test_squashing_scaler.py +++ b/skrub/tests/test_squashing_scaler.py @@ -144,3 +144,11 @@ def test_squashing_scaler_known_values(df_module): -1, 1 ) assert_almost_equal(X_target, X_out) + + +def test_doc_link_skrub_class(): + """Public skrub classes get a link to skrub documentation.""" + link = SquashingScaler()._get_doc_link() + assert link == ( + "https://skrub-data.org/stable/reference/generated/skrub.SquashingScaler.html" + ) diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index fa5d4f13a..eaf1a109b 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -1109,3 +1109,15 @@ def test_pipeline_in_table_vectorizer(df_module): fit_transform_result = tv.fit_transform(df) transform_result = tv.transform(df) assert fit_transform_result.shape == transform_result.shape == (2, 4) + + +def test_doc_link_skrub_class(): + """Public skrub classes get a link to skrub documentation.""" + link = TableVectorizer()._get_doc_link() + assert link == ( + "https://skrub-data.org/stable/reference/generated/skrub.TableVectorizer.html" + ) + link = Cleaner()._get_doc_link() + assert link == ( + "https://skrub-data.org/stable/reference/generated/skrub.Cleaner.html" + ) From 1f1a82854b3a03dc338125dfcbd5d46962a23a38 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 8 Jun 2026 16:48:16 +0200 Subject: [PATCH 11/20] moving changes to a single file --- skrub/_apply_to_cols.py | 19 ++-------- 
skrub/_base.py | 33 +++++++++++++++++ skrub/_select_cols.py | 7 ++-- skrub/_squashing_scaler.py | 17 ++------- skrub/_table_vectorizer.py | 7 ++-- skrub/tests/test_apply_to_cols.py | 21 +---------- skrub/tests/test_base.py | 53 ++++++++++++++++++++++++++++ skrub/tests/test_select_cols.py | 12 ------- skrub/tests/test_squashing_scaler.py | 8 ----- skrub/tests/test_table_vectorizer.py | 12 ------- 10 files changed, 100 insertions(+), 89 deletions(-) create mode 100644 skrub/_base.py create mode 100644 skrub/tests/test_base.py diff --git a/skrub/_apply_to_cols.py b/skrub/_apply_to_cols.py index 029982850..77fb167cc 100644 --- a/skrub/_apply_to_cols.py +++ b/skrub/_apply_to_cols.py @@ -3,18 +3,19 @@ based on the type of the transformer passed to it. """ -from sklearn.base import BaseEstimator, TransformerMixin, check_is_fitted +from sklearn.base import TransformerMixin, check_is_fitted from . import selectors from ._apply_to_each_col import ApplyToEachCol from ._apply_to_sub_frame import ApplyToSubFrame +from ._base import BaseTransformer from ._sklearn_compat import _VisualBlock from ._wrap_transformer import wrap_transformer _SELECT_ALL_COLUMNS = selectors.all() -class ApplyToCols(TransformerMixin, BaseEstimator): +class ApplyToCols(TransformerMixin, BaseTransformer): """ Apply a transformer to selected columns in a dataframe. @@ -292,20 +293,6 @@ class ApplyToCols(TransformerMixin, BaseEstimator): 1 10.0 100.0 1.0 1.0 """ # noqa: E501 - _doc_link_module = "skrub" - - # Defining this as a property because it inherits from _HTMLDocumentationLinkMixin, - # which also defines _doc_link_template as a property, and we want to be able - # to override it. 
- @property - def _doc_link_template(self): - return getattr( - self, - "__doc_link_template", - "https://skrub-data.org/stable/reference/generated/" - "{estimator_module}.{estimator_name}.html", - ) - def __init__( self, transformer, diff --git a/skrub/_base.py b/skrub/_base.py new file mode 100644 index 000000000..e8a48fc50 --- /dev/null +++ b/skrub/_base.py @@ -0,0 +1,33 @@ +from sklearn.base import BaseEstimator + + +class BaseTransformer(BaseEstimator): + _doc_link_module = "skrub" + + # Defining this as a property because it inherits from _HTMLDocumentationLinkMixin, + # which also defines _doc_link_template as a property, and we want to be able + # to override it. + @property + def _doc_link_template(self): + return getattr( + self, + "__doc_link_template", + "https://skrub-data.org/stable/reference/generated/" + "{estimator_module}.{estimator_name}.html", + ) + + def fit(self, X, y=None): + return self + + def fit_transform(self, X, y=None): + return self.transform(X) + + def transform(self, X): + # This method should be overridden by subclasses. We raise an error here to + # make it clear to users that they need to implement this method if they are + # creating a custom transformer class. We also catch the error in check_output + # to provide a more informative error message if the output of transform has the + # wrong type. + raise NotImplementedError( + f"{self.__class__.__name__} does not implement the 'transform' method." + ) diff --git a/skrub/_select_cols.py b/skrub/_select_cols.py index d92252c56..86a33861c 100644 --- a/skrub/_select_cols.py +++ b/skrub/_select_cols.py @@ -1,10 +1,11 @@ -from sklearn.base import BaseEstimator, TransformerMixin, check_is_fitted +from sklearn.base import TransformerMixin, check_is_fitted from . 
import selectors as s +from ._base import BaseTransformer from ._single_column_transformer import SingleColumnTransformer -class SelectCols(TransformerMixin, BaseEstimator): +class SelectCols(TransformerMixin, BaseTransformer): """Select a subset of a DataFrame's columns. A ``ValueError`` is raised if any of the provided column names are not in the @@ -113,7 +114,7 @@ def _doc_link_template(self): ) -class DropCols(TransformerMixin, BaseEstimator): +class DropCols(TransformerMixin, BaseTransformer): """Drop a subset of a DataFrame's columns. The other columns are kept in their original order. A ``ValueError`` is raised if diff --git a/skrub/_squashing_scaler.py b/skrub/_squashing_scaler.py index fc44aa004..fcc055e40 100644 --- a/skrub/_squashing_scaler.py +++ b/skrub/_squashing_scaler.py @@ -5,6 +5,7 @@ from sklearn.preprocessing import RobustScaler from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted +from skrub._base import BaseTransformer from skrub._sklearn_compat import validate_data @@ -82,7 +83,7 @@ def transform(self, X): return self.scale_ * (X - self.median_) -class SquashingScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): +class SquashingScaler(OneToOneFeatureMixin, TransformerMixin, BaseTransformer): r"""Perform robust centering and scaling followed by soft clipping. When features have large outliers, smooth clipping prevents the outliers from @@ -335,17 +336,3 @@ def transform(self, X): X_tr = _set_zeros(X_tr, self.zero_cols_) return _soft_clip(X_tr, self.max_absolute_value, mask_inf) - - _doc_link_module = "skrub" - - # Defining this as a property because it inherits from _HTMLDocumentationLinkMixin, - # which also defines _doc_link_template as a property, and we want to be able - # to override it. 
- @property - def _doc_link_template(self): - return getattr( - self, - "__doc_link_template", - "https://skrub-data.org/stable/reference/generated/" - "{estimator_module}.{estimator_name}.html", - ) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 70193a9ba..b3b2e3e27 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -4,7 +4,7 @@ from collections.abc import Iterable import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin, clone +from sklearn.base import TransformerMixin, clone from sklearn.pipeline import make_pipeline from sklearn.preprocessing import OneHotEncoder from sklearn.utils.validation import check_is_fitted @@ -12,6 +12,7 @@ from . import _dataframe as sbd from . import _utils from . import selectors as s +from ._base import BaseTransformer from ._check_input import CheckInputDataFrame from ._clean_categories import CleanCategories from ._clean_null_strings import CleanNullStrings @@ -183,7 +184,7 @@ def _get_preprocessors( return steps -class Cleaner(TransformerMixin, BaseEstimator): +class Cleaner(TransformerMixin, BaseTransformer): """Column-wise consistency checks and sanitization of dtypes, null values and dates. The ``Cleaner`` performs some consistency checks and basic preprocessing @@ -555,7 +556,7 @@ def _doc_link_template(self): ) -class TableVectorizer(TransformerMixin, BaseEstimator): +class TableVectorizer(TransformerMixin, BaseTransformer): """Transform a dataframe to a numeric (vectorized) representation. 
This transformer preprocesses the given dataframe by first cleaning the data diff --git a/skrub/tests/test_apply_to_cols.py b/skrub/tests/test_apply_to_cols.py index 34e3e6a27..ab30926bd 100644 --- a/skrub/tests/test_apply_to_cols.py +++ b/skrub/tests/test_apply_to_cols.py @@ -1,14 +1,12 @@ import datetime -import re import sys import numpy as np import pytest from sklearn.exceptions import NotFittedError from sklearn.preprocessing import OrdinalEncoder, StandardScaler -from sklearn.utils import estimator_html_repr -from skrub import ApplyToCols, StringEncoder, TableVectorizer +from skrub import ApplyToCols from skrub import _dataframe as sbd from skrub import selectors as s from skrub._to_datetime import ToDatetime @@ -213,23 +211,6 @@ def test_get_feature_names_out_after_fit(df_module): assert feature_names == ["date_col"] -def test_doc_link_wrapped_transformer_in_html_repr(): - """The wrapped transformer's doc link appears in the HTML repr of ApplyToCols.""" - html = estimator_html_repr(ApplyToCols(StringEncoder())) - links = set(re.findall(r'href="(https?://[^#"]+)"', html)) - assert ( - "https://skrub-data.org/stable/reference/generated/skrub.StringEncoder.html" - in links - ) - - html = estimator_html_repr(ApplyToCols(TableVectorizer())) - links = set(re.findall(r'href="(https?://[^#"]+)"', html)) - assert ( - "https://skrub-data.org/stable/reference/generated/skrub.TableVectorizer.html" - in links - ) - - def test_getattr_raises_for_wrong_attribute(df_module): """Test __getattr__ raises proper AttributeError for wrong attributes.""" # Test that accessing transformers_ on non-single-column transformer raises error diff --git a/skrub/tests/test_base.py b/skrub/tests/test_base.py new file mode 100644 index 000000000..b26d34ed8 --- /dev/null +++ b/skrub/tests/test_base.py @@ -0,0 +1,53 @@ +import re + +from sklearn.utils import estimator_html_repr + +from skrub import ( + ApplyToCols, + Cleaner, + DropCols, + SelectCols, + StringEncoder, + TableVectorizer, +) + 
+ +def test_doc_link_apply_to_cols(): + """The wrapped transformer's doc link appears in the HTML repr of ApplyToCols.""" + html = estimator_html_repr(ApplyToCols(StringEncoder())) + links = set(re.findall(r'href="(https?://[^#"]+)"', html)) + assert ( + "https://skrub-data.org/stable/reference/generated/skrub.StringEncoder.html" + in links + ) + + html = estimator_html_repr(ApplyToCols(TableVectorizer())) + links = set(re.findall(r'href="(https?://[^#"]+)"', html)) + assert ( + "https://skrub-data.org/stable/reference/generated/skrub.TableVectorizer.html" + in links + ) + + +def test_doc_link_skrub_class_select_cols(): + """Public skrub classes get a link to skrub documentation.""" + link = SelectCols(cols=[])._get_doc_link() + assert link == ( + "https://skrub-data.org/stable/reference/generated/skrub.SelectCols.html" + ) + link = DropCols(cols=[])._get_doc_link() + assert link == ( + "https://skrub-data.org/stable/reference/generated/skrub.DropCols.html" + ) + + +def test_doc_link_table_vectorizer(): + """Public skrub classes get a link to skrub documentation.""" + link = TableVectorizer()._get_doc_link() + assert link == ( + "https://skrub-data.org/stable/reference/generated/skrub.TableVectorizer.html" + ) + link = Cleaner()._get_doc_link() + assert link == ( + "https://skrub-data.org/stable/reference/generated/skrub.Cleaner.html" + ) diff --git a/skrub/tests/test_select_cols.py b/skrub/tests/test_select_cols.py index 26cba9d3a..739c86eb7 100644 --- a/skrub/tests/test_select_cols.py +++ b/skrub/tests/test_select_cols.py @@ -88,15 +88,3 @@ def test_get_feature_names_out(df): pipeline = make_pipeline(DropCols(cols=["A", "B"]), DummyClassifier()) pipeline.fit(df, df["C"]) assert pipeline[:-1].get_feature_names_out() == ["C"] - - -def test_doc_link_skrub_class(): - """Public skrub classes get a link to skrub documentation.""" - link = SelectCols(cols=[])._get_doc_link() - assert link == ( - "https://skrub-data.org/stable/reference/generated/skrub.SelectCols.html" - 
) - link = DropCols(cols=[])._get_doc_link() - assert link == ( - "https://skrub-data.org/stable/reference/generated/skrub.DropCols.html" - ) diff --git a/skrub/tests/test_squashing_scaler.py b/skrub/tests/test_squashing_scaler.py index e74e909e3..f40ba1731 100644 --- a/skrub/tests/test_squashing_scaler.py +++ b/skrub/tests/test_squashing_scaler.py @@ -144,11 +144,3 @@ def test_squashing_scaler_known_values(df_module): -1, 1 ) assert_almost_equal(X_target, X_out) - - -def test_doc_link_skrub_class(): - """Public skrub classes get a link to skrub documentation.""" - link = SquashingScaler()._get_doc_link() - assert link == ( - "https://skrub-data.org/stable/reference/generated/skrub.SquashingScaler.html" - ) diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index cb94fe965..43d438071 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -1250,18 +1250,6 @@ def test_pipeline_in_table_vectorizer(df_module): assert fit_transform_result.shape == transform_result.shape == (2, 4) -def test_doc_link_skrub_class(): - """Public skrub classes get a link to skrub documentation.""" - link = TableVectorizer()._get_doc_link() - assert link == ( - "https://skrub-data.org/stable/reference/generated/skrub.TableVectorizer.html" - ) - link = Cleaner()._get_doc_link() - assert link == ( - "https://skrub-data.org/stable/reference/generated/skrub.Cleaner.html" - ) - - def test_duration_to_float(df_module): df = df_module.make_dataframe( { From 79cc5e984966b2761371738a36e376ae9b0f4940 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 8 Jun 2026 16:50:14 +0200 Subject: [PATCH 12/20] _ --- skrub/_base.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/skrub/_base.py b/skrub/_base.py index e8a48fc50..411bb7f49 100644 --- a/skrub/_base.py +++ b/skrub/_base.py @@ -15,19 +15,3 @@ def _doc_link_template(self): "https://skrub-data.org/stable/reference/generated/" 
"{estimator_module}.{estimator_name}.html", ) - - def fit(self, X, y=None): - return self - - def fit_transform(self, X, y=None): - return self.transform(X) - - def transform(self, X): - # This method should be overridden by subclasses. We raise an error here to - # make it clear to users that they need to implement this method if they are - # creating a custom transformer class. We also catch the error in check_output - # to provide a more informative error message if the output of transform has the - # wrong type. - raise NotImplementedError( - f"{self.__class__.__name__} does not implement the 'transform' method." - ) From 48949d2e137fd7737cf0d67d15604d049294e12c Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 8 Jun 2026 16:59:53 +0200 Subject: [PATCH 13/20] tests --- skrub/_select_cols.py | 28 ---------------------------- skrub/_single_column_transformer.py | 17 ++--------------- skrub/tests/test_base.py | 2 +- 3 files changed, 3 insertions(+), 44 deletions(-) diff --git a/skrub/_select_cols.py b/skrub/_select_cols.py index 86a33861c..91fe7c658 100644 --- a/skrub/_select_cols.py +++ b/skrub/_select_cols.py @@ -44,8 +44,6 @@ class SelectCols(TransformerMixin, BaseTransformer): ValueError: The following columns are requested for selection but missing from dataframe: ['X'] """ # noqa: E501 - _doc_link_module = "skrub" - def __init__(self, cols): self.cols = cols @@ -101,18 +99,6 @@ def get_feature_names_out(self, input_features=None): check_is_fitted(self, "columns_") return self.columns_ - # Defining this as a property because it inherits from _HTMLDocumentationLinkMixin, - # which also defines _doc_link_template as a property, and we want to be able - # to override it. 
- @property - def _doc_link_template(self): - return getattr( - self, - "__doc_link_template", - "https://skrub-data.org/stable/reference/generated/" - "{estimator_module}.{estimator_name}.html", - ) - class DropCols(TransformerMixin, BaseTransformer): """Drop a subset of a DataFrame's columns. @@ -153,8 +139,6 @@ class DropCols(TransformerMixin, BaseTransformer): ValueError: The following columns are requested for selection but missing from dataframe: ['X'] """ # noqa: E501 - _doc_link_module = "skrub" - def __init__(self, cols): self.cols = cols @@ -212,18 +196,6 @@ def get_feature_names_out(self, input_features=None): check_is_fitted(self, "kept_cols_") return self.kept_cols_ - # Defining this as a property because it inherits from _HTMLDocumentationLinkMixin, - # which also defines _doc_link_template as a property, and we want to be able - # to override it. - @property - def _doc_link_template(self): - return getattr( - self, - "__doc_link_template", - "https://skrub-data.org/stable/reference/generated/" - "{estimator_module}.{estimator_name}.html", - ) - class Drop(SingleColumnTransformer): def fit_transform(self, column, y=None): diff --git a/skrub/_single_column_transformer.py b/skrub/_single_column_transformer.py index 6d651c2e3..936b6ae39 100644 --- a/skrub/_single_column_transformer.py +++ b/skrub/_single_column_transformer.py @@ -4,12 +4,12 @@ import re import textwrap -from sklearn.base import BaseEstimator from sklearn.pipeline import Pipeline from sklearn.utils.validation import check_is_fitted from . import _dataframe as sbd from . import _utils +from ._base import BaseTransformer __all__ = ["SingleColumnTransformer", "RejectColumn"] @@ -120,7 +120,7 @@ class RejectColumn(ValueError): pass -class SingleColumnTransformer(BaseEstimator): +class SingleColumnTransformer(BaseTransformer): """Base class for single-column transformers. 
Such transformers are applied independently to each column by @@ -144,19 +144,6 @@ class SingleColumnTransformer(BaseEstimator): """ __single_column_transformer__ = True - _doc_link_module = "skrub" - - # Defining this as a property because it inherits from _HTMLDocumentationLinkMixin, - # which also defines _doc_link_template as a property, and we want to be able - # to override it. - @property - def _doc_link_template(self): - return getattr( - self, - "__doc_link_template", - "https://skrub-data.org/stable/reference/generated/" - "{estimator_module}.{estimator_name}.html", - ) def set_output(self, *, transform=None): """ diff --git a/skrub/tests/test_base.py b/skrub/tests/test_base.py index b26d34ed8..ff002ad9c 100644 --- a/skrub/tests/test_base.py +++ b/skrub/tests/test_base.py @@ -17,7 +17,7 @@ def test_doc_link_apply_to_cols(): html = estimator_html_repr(ApplyToCols(StringEncoder())) links = set(re.findall(r'href="(https?://[^#"]+)"', html)) assert ( - "https://skrub-data.org/stable/reference/generated/skrub.StringEncoder.html" + "https://skrub-data.org/stable/reference/generated/skrub.ApplyToCols.html" in links ) From f1f07d9f24533f7a6273d5402cf9d24b9e7dd7f3 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 8 Jun 2026 17:07:54 +0200 Subject: [PATCH 14/20] removing unnecessary code --- skrub/_table_vectorizer.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index b3b2e3e27..823e8b4b6 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -395,8 +395,6 @@ class Cleaner(TransformerMixin, BaseTransformer): [DropUninformative()] """ - _doc_link_module = "skrub" - def __init__( self, drop_null_fraction=1.0, @@ -543,18 +541,6 @@ def get_feature_names_out(self, input_features=None): check_is_fitted(self, "all_outputs_") return np.asarray(self.all_outputs_) - # Defining this as a property because it inherits from _HTMLDocumentationLinkMixin, - # 
which also defines _doc_link_template as a property, and we want to be able - # to override it. - @property - def _doc_link_template(self): - return getattr( - self, - "__doc_link_template", - "https://skrub-data.org/stable/reference/generated/" - "{estimator_module}.{estimator_name}.html", - ) - class TableVectorizer(TransformerMixin, BaseTransformer): """Transform a dataframe to a numeric (vectorized) representation. @@ -899,8 +885,6 @@ class TableVectorizer(TransformerMixin, BaseTransformer): ValueError: Column 'A' used twice in 'specific_transformers', at indices 0 and 1. """ # noqa: E501 - _doc_link_module = "skrub" - def __init__( self, *, @@ -1183,15 +1167,3 @@ def get_feature_names_out(self, input_features=None): """ check_is_fitted(self, "all_outputs_") return np.asarray(self.all_outputs_) - - # Defining this as a property because it inherits from _HTMLDocumentationLinkMixin, - # which also defines _doc_link_template as a property, and we want to be able - # to override it. - @property - def _doc_link_template(self): - return getattr( - self, - "__doc_link_template", - "https://skrub-data.org/stable/reference/generated/" - "{estimator_module}.{estimator_name}.html", - ) From a1f520fbf797af2b1dcc9d6500a0908f2d97f911 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 16 Jun 2026 11:46:23 +0200 Subject: [PATCH 15/20] addressing comments from review --- skrub/_apply_to_cols.py | 17 ++--------------- skrub/_base.py | 9 ++++++++- skrub/_data_ops/_estimator.py | 14 ++------------ skrub/_data_ops/tests/test_estimators.py | 10 ++++++++++ skrub/_select_cols.py | 6 +++--- skrub/_single_column_transformer.py | 4 ++-- skrub/_squashing_scaler.py | 4 ++-- skrub/_table_vectorizer.py | 6 +++--- 8 files changed, 32 insertions(+), 38 deletions(-) diff --git a/skrub/_apply_to_cols.py b/skrub/_apply_to_cols.py index bda6f086c..7dcd641ba 100644 --- a/skrub/_apply_to_cols.py +++ b/skrub/_apply_to_cols.py @@ -8,14 +8,13 @@ from . 
import selectors from ._apply_to_each_col import ApplyToEachCol from ._apply_to_sub_frame import ApplyToSubFrame -from ._base import BaseTransformer -from ._sklearn_compat import _VisualBlock +from ._base import SkrubBaseTransformer from ._wrap_transformer import wrap_transformer _SELECT_ALL_COLUMNS = selectors.all() -class ApplyToCols(TransformerMixin, BaseTransformer): +class ApplyToCols(TransformerMixin, SkrubBaseTransformer): """ Apply a transformer to selected columns in a dataframe. @@ -433,18 +432,6 @@ def get_feature_names_out(self, input_features=None): return self._wrapped_transformer.get_feature_names_out(input_features) - def _sk_visual_block_(self): - # This is needed because when ApplyToCols is used with a transformer like - # TableVectorizer then the estimator is shown as a parallel block, which - # would not add the documentation link. - # With this override the problem is fixed. - return _VisualBlock( - "serial", - [self.transformer], - names=[self.transformer.__class__.__name__], - name_details=[str(self.transformer)], - ) - def __getattr__(self, name): if name == "transformers_" and isinstance( getattr(self, "_wrapped_transformer", None), ApplyToSubFrame diff --git a/skrub/_base.py b/skrub/_base.py index 411bb7f49..241f4470e 100644 --- a/skrub/_base.py +++ b/skrub/_base.py @@ -1,7 +1,14 @@ from sklearn.base import BaseEstimator -class BaseTransformer(BaseEstimator): +class SkrubBaseTransformer(BaseEstimator): + """Base class for all skrub transformers. + + This is a class that all skrub transformers inherit from. + For the moment, it's only used for the documentation url, but eventually + it will be used for other things as well. 
+ """ + _doc_link_module = "skrub" # Defining this as a property because it inherits from _HTMLDocumentationLinkMixin, diff --git a/skrub/_data_ops/_estimator.py b/skrub/_data_ops/_estimator.py index 960ab587c..d797f2dc3 100644 --- a/skrub/_data_ops/_estimator.py +++ b/skrub/_data_ops/_estimator.py @@ -16,6 +16,7 @@ from .. import _dataframe as sbd from .. import _join_utils +from .._base import SkrubBaseTransformer from .._sklearn_compat import _safe_indexing, _VisualBlock from .._utils import set_module from . import _evaluation @@ -179,7 +180,7 @@ def _get_params_html(self, deep=True, doc_link=""): @set_module("skrub") -class SkrubLearner(_DataOpWrapperMixin, BaseEstimator): +class SkrubLearner(_DataOpWrapperMixin, SkrubBaseTransformer): """Learner that evaluates a skrub DataOp. This class is not meant to be instantiated manually, ``SkrubLearner`` @@ -787,17 +788,6 @@ def describe_params(self): """ return describe_params(eval_choices(self.data_op), choice_graph(self.data_op)) - _doc_link_module = "skrub" - - @property - def _doc_link_template(self): - return getattr( - self, - "__doc_link_template", - "https://skrub-data.org/stable/reference/generated/" - "{estimator_module}.{estimator_name}.html", - ) - def _to_Xy_pipeline(learner, environment): return learner.__skrub_to_Xy_pipeline__(environment) diff --git a/skrub/_data_ops/tests/test_estimators.py b/skrub/_data_ops/tests/test_estimators.py index 34d84f1b9..007a17a0a 100644 --- a/skrub/_data_ops/tests/test_estimators.py +++ b/skrub/_data_ops/tests/test_estimators.py @@ -1460,3 +1460,13 @@ def load_data(): pred = X.skb.apply(DummyClassifier(), y=y) search = pred.skb.make_grid_search(scoring="roc_auc").fit({}) assert search.results_.shape[0] == 1 + + +def test_learner_docstring(): + data_op, data = get_data_op_and_data("simple") + split = data_op.skb.train_test_split(data) + learner = data_op.skb.make_learner().fit(split["train"]) + link = learner._get_doc_link() + assert link == ( + 
"https://skrub-data.org/stable/reference/generated/skrub.SkrubLearner.html" + ) diff --git a/skrub/_select_cols.py b/skrub/_select_cols.py index e2aaf4509..4dac4d1dc 100644 --- a/skrub/_select_cols.py +++ b/skrub/_select_cols.py @@ -1,11 +1,11 @@ from sklearn.base import TransformerMixin, check_is_fitted from . import selectors as s -from ._base import BaseTransformer +from ._base import SkrubBaseTransformer from ._single_column_transformer import SingleColumnTransformer -class SelectCols(TransformerMixin, BaseTransformer): +class SelectCols(TransformerMixin, SkrubBaseTransformer): """Select a subset of a DataFrame's columns. A ``ValueError`` is raised if any of the provided column names are not in the @@ -100,7 +100,7 @@ def get_feature_names_out(self, input_features=None): return self.columns_ -class DropCols(TransformerMixin, BaseTransformer): +class DropCols(TransformerMixin, SkrubBaseTransformer): """Drop a subset of a DataFrame's columns. The other columns are kept in their original order. A ``ValueError`` is raised if diff --git a/skrub/_single_column_transformer.py b/skrub/_single_column_transformer.py index 936b6ae39..f7f1e913d 100644 --- a/skrub/_single_column_transformer.py +++ b/skrub/_single_column_transformer.py @@ -9,7 +9,7 @@ from . import _dataframe as sbd from . import _utils -from ._base import BaseTransformer +from ._base import SkrubBaseTransformer __all__ = ["SingleColumnTransformer", "RejectColumn"] @@ -120,7 +120,7 @@ class RejectColumn(ValueError): pass -class SingleColumnTransformer(BaseTransformer): +class SingleColumnTransformer(SkrubBaseTransformer): """Base class for single-column transformers. 
Such transformers are applied independently to each column by diff --git a/skrub/_squashing_scaler.py b/skrub/_squashing_scaler.py index fcc055e40..4eae99355 100644 --- a/skrub/_squashing_scaler.py +++ b/skrub/_squashing_scaler.py @@ -5,7 +5,7 @@ from sklearn.preprocessing import RobustScaler from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted -from skrub._base import BaseTransformer +from skrub._base import SkrubBaseTransformer from skrub._sklearn_compat import validate_data @@ -83,7 +83,7 @@ def transform(self, X): return self.scale_ * (X - self.median_) -class SquashingScaler(OneToOneFeatureMixin, TransformerMixin, BaseTransformer): +class SquashingScaler(OneToOneFeatureMixin, TransformerMixin, SkrubBaseTransformer): r"""Perform robust centering and scaling followed by soft clipping. When features have large outliers, smooth clipping prevents the outliers from diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 823e8b4b6..59910dafe 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -12,7 +12,7 @@ from . import _dataframe as sbd from . import _utils from . import selectors as s -from ._base import BaseTransformer +from ._base import SkrubBaseTransformer from ._check_input import CheckInputDataFrame from ._clean_categories import CleanCategories from ._clean_null_strings import CleanNullStrings @@ -184,7 +184,7 @@ def _get_preprocessors( return steps -class Cleaner(TransformerMixin, BaseTransformer): +class Cleaner(TransformerMixin, SkrubBaseTransformer): """Column-wise consistency checks and sanitization of dtypes, null values and dates. 
The ``Cleaner`` performs some consistency checks and basic preprocessing @@ -542,7 +542,7 @@ def get_feature_names_out(self, input_features=None): return np.asarray(self.all_outputs_) -class TableVectorizer(TransformerMixin, BaseTransformer): +class TableVectorizer(TransformerMixin, SkrubBaseTransformer): """Transform a dataframe to a numeric (vectorized) representation. This transformer preprocesses the given dataframe by first cleaning the data From 20702bd8292f7108da44cb86b2d1b9abec757dd4 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 16 Jun 2026 12:08:57 +0200 Subject: [PATCH 16/20] bringing back code block and better comment --- skrub/_apply_to_cols.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/skrub/_apply_to_cols.py b/skrub/_apply_to_cols.py index 7dcd641ba..cd4eaf427 100644 --- a/skrub/_apply_to_cols.py +++ b/skrub/_apply_to_cols.py @@ -9,6 +9,7 @@ from ._apply_to_each_col import ApplyToEachCol from ._apply_to_sub_frame import ApplyToSubFrame from ._base import SkrubBaseTransformer +from ._sklearn_compat import _VisualBlock from ._wrap_transformer import wrap_transformer _SELECT_ALL_COLUMNS = selectors.all() @@ -432,6 +433,20 @@ def get_feature_names_out(self, input_features=None): return self._wrapped_transformer.get_feature_names_out(input_features) + def _sk_visual_block_(self): + # This is needed because cases like ApplyToCols(TableVectorizer()) + # would show the TableVectorizer as a parallel block, which would not + # add the documentation link. With this override the problem is fixed. 
+ # The same problem happens for ApplyToCols(ApplyToCols(...)) (not that + # someone should do that, but it is possible) + + return _VisualBlock( + "serial", + [self.transformer], + names=[self.transformer.__class__.__name__], + name_details=[str(self.transformer)], + ) + def __getattr__(self, name): if name == "transformers_" and isinstance( getattr(self, "_wrapped_transformer", None), ApplyToSubFrame From ce9dc05d62f70b1307c9939962c9e1888c6e3afc Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 16 Jun 2026 12:19:42 +0200 Subject: [PATCH 17/20] addressing all missing files --- skrub/_agg_joiner.py | 7 ++++--- skrub/_apply_to_each_col.py | 6 ++++-- skrub/_apply_to_sub_frame.py | 6 ++++-- skrub/_check_input.py | 6 ++++-- skrub/_drop_similar.py | 5 +++-- skrub/_interpolation_joiner.py | 6 ++++-- skrub/_joiner.py | 6 ++++-- skrub/_matching.py | 5 +++-- skrub/_multi_agg_joiner.py | 5 +++-- skrub/_squashing_scaler.py | 4 ++-- 10 files changed, 35 insertions(+), 21 deletions(-) diff --git a/skrub/_agg_joiner.py b/skrub/_agg_joiner.py index 46ea4b06c..d03f54ef1 100644 --- a/skrub/_agg_joiner.py +++ b/skrub/_agg_joiner.py @@ -10,12 +10,13 @@ import numpy as np import pandas as pd -from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.base import TransformerMixin from sklearn.utils.validation import check_is_fitted from skrub import _dataframe as sbd from skrub import _join_utils, _utils from skrub import selectors as s +from skrub._base import SkrubBaseTransformer from skrub._dispatch import dispatch, raise_dispatch_unregistered_type from ._check_input import CheckInputDataFrame @@ -168,7 +169,7 @@ def check_other_inputs(operations, suffix): return operations, suffix -class AggJoiner(TransformerMixin, BaseEstimator): +class AggJoiner(TransformerMixin, SkrubBaseTransformer): """Aggregate an auxiliary dataframe before joining it on a base dataframe. Apply numerical and categorical aggregation operations on the columns (i.e. 
`cols`) @@ -407,7 +408,7 @@ def get_feature_names_out(self): return self.all_outputs_ -class AggTarget(TransformerMixin, BaseEstimator): +class AggTarget(TransformerMixin, SkrubBaseTransformer): """Aggregate a target `y` before joining its aggregation on a base dataframe. Accepts :obj:`pandas.DataFrame` or :class:`polars.DataFrame` inputs. diff --git a/skrub/_apply_to_each_col.py b/skrub/_apply_to_each_col.py index 70e99cfdd..fcd46b66f 100644 --- a/skrub/_apply_to_each_col.py +++ b/skrub/_apply_to_each_col.py @@ -1,9 +1,11 @@ import itertools from joblib import Parallel, delayed -from sklearn.base import BaseEstimator, TransformerMixin, clone +from sklearn.base import TransformerMixin, clone from sklearn.utils.validation import check_is_fitted +from skrub._base import SkrubBaseTransformer + from . import _dataframe as sbd from . import _utils, selectors from ._join_utils import pick_column_names @@ -15,7 +17,7 @@ _SELECT_ALL_COLUMNS = selectors.all() -class ApplyToEachCol(BaseEstimator, TransformerMixin): +class ApplyToEachCol(SkrubBaseTransformer, TransformerMixin): """ Map a transformer to columns in a dataframe. diff --git a/skrub/_apply_to_sub_frame.py b/skrub/_apply_to_sub_frame.py index 793969eea..270465ae7 100644 --- a/skrub/_apply_to_sub_frame.py +++ b/skrub/_apply_to_sub_frame.py @@ -1,6 +1,8 @@ -from sklearn.base import BaseEstimator, TransformerMixin, clone +from sklearn.base import TransformerMixin, clone from sklearn.utils.validation import check_is_fitted +from skrub._base import SkrubBaseTransformer + from . import _dataframe as sbd from . import _utils, selectors from ._join_utils import pick_column_names @@ -11,7 +13,7 @@ _SELECT_ALL_COLUMNS = selectors.all() -class ApplyToSubFrame(TransformerMixin, BaseEstimator): +class ApplyToSubFrame(TransformerMixin, SkrubBaseTransformer): """Apply a transformer to part of a dataframe. 
A subset of the dataframe is selected and passed to the transformer (as a diff --git a/skrub/_check_input.py b/skrub/_check_input.py index c1dd86768..92dd206b4 100644 --- a/skrub/_check_input.py +++ b/skrub/_check_input.py @@ -2,9 +2,11 @@ import numpy as np import pandas as pd -from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.base import TransformerMixin from sklearn.utils.validation import check_is_fitted +from skrub._base import SkrubBaseTransformer + from . import _dataframe as sbd from . import _join_utils, _utils from ._dispatch import dispatch @@ -72,7 +74,7 @@ def _check_is_dataframe(df): return df -class CheckInputDataFrame(TransformerMixin, BaseEstimator): +class CheckInputDataFrame(TransformerMixin, SkrubBaseTransformer): """Check the dataframe entering a skrub pipeline. This transformer ensures that: diff --git a/skrub/_drop_similar.py b/skrub/_drop_similar.py index 75906807e..207f70675 100644 --- a/skrub/_drop_similar.py +++ b/skrub/_drop_similar.py @@ -9,11 +9,12 @@ pass import numbers -from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.base import TransformerMixin from sklearn.utils.validation import check_is_fitted from . import _dataframe as sbd from . import selectors as s +from ._base import SkrubBaseTransformer from ._column_associations import column_associations from ._dataframe._common import raise_dispatch_unregistered_type from ._dispatch import dispatch @@ -35,7 +36,7 @@ def _filter_associations_polars(obj, threshold): return obj.filter(pl.col("cramer_v") >= threshold) -class DropSimilar(TransformerMixin, BaseEstimator): +class DropSimilar(TransformerMixin, SkrubBaseTransformer): """Drop columns found too redundant to the rest of the dataframe, according to association defined by Cramér's V. 
diff --git a/skrub/_interpolation_joiner.py b/skrub/_interpolation_joiner.py index f08c97eb4..7775e021a 100644 --- a/skrub/_interpolation_joiner.py +++ b/skrub/_interpolation_joiner.py @@ -2,12 +2,14 @@ import joblib import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin, clone +from sklearn.base import TransformerMixin, clone from sklearn.ensemble import ( HistGradientBoostingClassifier, HistGradientBoostingRegressor, ) +from skrub._base import SkrubBaseTransformer + from . import _dataframe as sbd from . import _join_utils, _utils from . import selectors as s @@ -20,7 +22,7 @@ DEFAULT_VECTORIZER = TableVectorizer(high_cardinality=MinHashEncoder()) -class InterpolationJoiner(TransformerMixin, BaseEstimator): +class InterpolationJoiner(TransformerMixin, SkrubBaseTransformer): """Join with a table augmented by machine-learning predictions. This is similar to a usual equi-join, but instead of looking for actual diff --git a/skrub/_joiner.py b/skrub/_joiner.py index 895f52bc3..90de69fb2 100644 --- a/skrub/_joiner.py +++ b/skrub/_joiner.py @@ -5,13 +5,15 @@ from functools import partial import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin, clone +from sklearn.base import TransformerMixin, clone from sklearn.compose import make_column_transformer from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, StandardScaler from sklearn.utils.validation import check_is_fitted +from skrub._base import SkrubBaseTransformer + from . import _dataframe as sbd from . import _join_utils, _matching, _utils from . 
import selectors as s @@ -76,7 +78,7 @@ def _make_vectorizer(table, string_encoder, rescale): return make_pipeline(skrubber, make_column_transformer(*transformers)) -class Joiner(TransformerMixin, BaseEstimator): +class Joiner(TransformerMixin, SkrubBaseTransformer): """Augment features in a main table by fuzzy-joining an auxiliary table to it. This transformer is initialized with an auxiliary table `aux_table`. It diff --git a/skrub/_matching.py b/skrub/_matching.py index 9c9933eff..8f60db25f 100644 --- a/skrub/_matching.py +++ b/skrub/_matching.py @@ -1,11 +1,12 @@ import numpy as np from scipy import sparse -from sklearn.base import BaseEstimator from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_random_state +from skrub._base import SkrubBaseTransformer -class Matching(BaseEstimator): + +class Matching(SkrubBaseTransformer): """Base class for fuzzy-join matching & distance rescaling. This class is a helper for the ``Joiner`` and ``fuzzy_join``. It is diff --git a/skrub/_multi_agg_joiner.py b/skrub/_multi_agg_joiner.py index 3c0dff5d8..ccca0720e 100644 --- a/skrub/_multi_agg_joiner.py +++ b/skrub/_multi_agg_joiner.py @@ -2,10 +2,11 @@ The MultiAggJoiner extends AggJoiner to multiple auxiliary tables. """ -from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.base import TransformerMixin from sklearn.utils.validation import check_is_fitted from skrub._agg_joiner import AggJoiner +from skrub._base import SkrubBaseTransformer from skrub._dataframe import _common as sbd from skrub._utils import _is_array_like @@ -17,7 +18,7 @@ def _is_iterable_of_iterable_of_str(x): ) -class MultiAggJoiner(TransformerMixin, BaseEstimator): +class MultiAggJoiner(TransformerMixin, SkrubBaseTransformer): """Extension of the :class:`AggJoiner` to multiple auxiliary tables. 
Apply numerical and categorical aggregation operations on the `cols` diff --git a/skrub/_squashing_scaler.py b/skrub/_squashing_scaler.py index 4eae99355..f78f91609 100644 --- a/skrub/_squashing_scaler.py +++ b/skrub/_squashing_scaler.py @@ -1,7 +1,7 @@ import numbers import numpy as np -from sklearn.base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin +from sklearn.base import OneToOneFeatureMixin, TransformerMixin from sklearn.preprocessing import RobustScaler from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted @@ -52,7 +52,7 @@ def _soft_clip(X, max_absolute_value, mask_inf): return X -class _MinMaxScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): +class _MinMaxScaler(OneToOneFeatureMixin, TransformerMixin, SkrubBaseTransformer): """A variation of scikit-learn MinMaxScaler. A simple min-max scaler that centers the median to zero and scales From 0fe8479bbc133a462b5cf945dba9daa126bd0a9e Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 16 Jun 2026 13:04:49 +0200 Subject: [PATCH 18/20] fixing relative imports --- skrub/_agg_joiner.py | 2 +- skrub/_apply_to_each_col.py | 3 +-- skrub/_apply_to_sub_frame.py | 3 +-- skrub/_check_input.py | 3 +-- skrub/_interpolation_joiner.py | 3 +-- skrub/_joiner.py | 3 +-- skrub/_matching.py | 2 +- skrub/_multi_agg_joiner.py | 3 ++- skrub/_squashing_scaler.py | 3 ++- 9 files changed, 11 insertions(+), 14 deletions(-) diff --git a/skrub/_agg_joiner.py b/skrub/_agg_joiner.py index d03f54ef1..2826707a2 100644 --- a/skrub/_agg_joiner.py +++ b/skrub/_agg_joiner.py @@ -16,9 +16,9 @@ from skrub import _dataframe as sbd from skrub import _join_utils, _utils from skrub import selectors as s -from skrub._base import SkrubBaseTransformer from skrub._dispatch import dispatch, raise_dispatch_unregistered_type +from ._base import SkrubBaseTransformer from ._check_input import CheckInputDataFrame try: diff --git a/skrub/_apply_to_each_col.py b/skrub/_apply_to_each_col.py index fcd46b66f..36aa5b1ee 
100644 --- a/skrub/_apply_to_each_col.py +++ b/skrub/_apply_to_each_col.py @@ -4,10 +4,9 @@ from sklearn.base import TransformerMixin, clone from sklearn.utils.validation import check_is_fitted -from skrub._base import SkrubBaseTransformer - from . import _dataframe as sbd from . import _utils, selectors +from ._base import SkrubBaseTransformer from ._join_utils import pick_column_names from ._single_column_transformer import RejectColumn, is_single_column_transformer diff --git a/skrub/_apply_to_sub_frame.py b/skrub/_apply_to_sub_frame.py index 270465ae7..1a0458fed 100644 --- a/skrub/_apply_to_sub_frame.py +++ b/skrub/_apply_to_sub_frame.py @@ -1,10 +1,9 @@ from sklearn.base import TransformerMixin, clone from sklearn.utils.validation import check_is_fitted -from skrub._base import SkrubBaseTransformer - from . import _dataframe as sbd from . import _utils, selectors +from ._base import SkrubBaseTransformer from ._join_utils import pick_column_names __all__ = ["ApplyToSubFrame"] diff --git a/skrub/_check_input.py b/skrub/_check_input.py index 92dd206b4..deb6d99ef 100644 --- a/skrub/_check_input.py +++ b/skrub/_check_input.py @@ -5,10 +5,9 @@ from sklearn.base import TransformerMixin from sklearn.utils.validation import check_is_fitted -from skrub._base import SkrubBaseTransformer - from . import _dataframe as sbd from . import _join_utils, _utils +from ._base import SkrubBaseTransformer from ._dispatch import dispatch __all__ = ["CheckInputDataFrame", "cast_column_names_to_strings"] diff --git a/skrub/_interpolation_joiner.py b/skrub/_interpolation_joiner.py index 7775e021a..4cc1e0ba9 100644 --- a/skrub/_interpolation_joiner.py +++ b/skrub/_interpolation_joiner.py @@ -8,11 +8,10 @@ HistGradientBoostingRegressor, ) -from skrub._base import SkrubBaseTransformer - from . import _dataframe as sbd from . import _join_utils, _utils from . 
import selectors as s +from ._base import SkrubBaseTransformer from ._minhash_encoder import MinHashEncoder from ._sklearn_compat import get_tags from ._table_vectorizer import TableVectorizer diff --git a/skrub/_joiner.py b/skrub/_joiner.py index 90de69fb2..de27526ca 100644 --- a/skrub/_joiner.py +++ b/skrub/_joiner.py @@ -12,11 +12,10 @@ from sklearn.preprocessing import FunctionTransformer, StandardScaler from sklearn.utils.validation import check_is_fitted -from skrub._base import SkrubBaseTransformer - from . import _dataframe as sbd from . import _join_utils, _matching, _utils from . import selectors as s +from ._base import SkrubBaseTransformer from ._check_input import CheckInputDataFrame from ._datetime_encoder import DatetimeEncoder from ._table_vectorizer import TableVectorizer diff --git a/skrub/_matching.py b/skrub/_matching.py index 8f60db25f..6b3d78772 100644 --- a/skrub/_matching.py +++ b/skrub/_matching.py @@ -3,7 +3,7 @@ from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_random_state -from skrub._base import SkrubBaseTransformer +from ._base import SkrubBaseTransformer class Matching(SkrubBaseTransformer): diff --git a/skrub/_multi_agg_joiner.py b/skrub/_multi_agg_joiner.py index ccca0720e..6c78de957 100644 --- a/skrub/_multi_agg_joiner.py +++ b/skrub/_multi_agg_joiner.py @@ -6,10 +6,11 @@ from sklearn.utils.validation import check_is_fitted from skrub._agg_joiner import AggJoiner -from skrub._base import SkrubBaseTransformer from skrub._dataframe import _common as sbd from skrub._utils import _is_array_like +from ._base import SkrubBaseTransformer + def _is_iterable_of_iterable_of_str(x): "Return True if x is an iterable of iterable of str and False otherwise." 
diff --git a/skrub/_squashing_scaler.py b/skrub/_squashing_scaler.py index f78f91609..606eb96e6 100644 --- a/skrub/_squashing_scaler.py +++ b/skrub/_squashing_scaler.py @@ -5,9 +5,10 @@ from sklearn.preprocessing import RobustScaler from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted -from skrub._base import SkrubBaseTransformer from skrub._sklearn_compat import validate_data +from ._base import SkrubBaseTransformer + def _mask_inf(X): """Replace infinite values with NaN and return their sign.""" From 1752ffab0d740655571ecc2bbca92523ec61a055 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 16 Jun 2026 13:21:22 +0200 Subject: [PATCH 19/20] renaming to estimator, fixing order, adding to paramsearch --- skrub/_agg_joiner.py | 6 +++--- skrub/_apply_to_cols.py | 4 ++-- skrub/_apply_to_each_col.py | 4 ++-- skrub/_apply_to_sub_frame.py | 4 ++-- skrub/_base.py | 2 +- skrub/_check_input.py | 4 ++-- skrub/_data_ops/_estimator.py | 6 +++--- skrub/_drop_similar.py | 4 ++-- skrub/_interpolation_joiner.py | 4 ++-- skrub/_joiner.py | 4 ++-- skrub/_matching.py | 4 ++-- skrub/_multi_agg_joiner.py | 4 ++-- skrub/_select_cols.py | 6 +++--- skrub/_single_column_transformer.py | 4 ++-- skrub/_squashing_scaler.py | 6 +++--- skrub/_table_vectorizer.py | 6 +++--- 16 files changed, 36 insertions(+), 36 deletions(-) diff --git a/skrub/_agg_joiner.py b/skrub/_agg_joiner.py index 2826707a2..43e8d4db6 100644 --- a/skrub/_agg_joiner.py +++ b/skrub/_agg_joiner.py @@ -18,7 +18,7 @@ from skrub import selectors as s from skrub._dispatch import dispatch, raise_dispatch_unregistered_type -from ._base import SkrubBaseTransformer +from ._base import SkrubBaseEstimator from ._check_input import CheckInputDataFrame try: @@ -169,7 +169,7 @@ def check_other_inputs(operations, suffix): return operations, suffix -class AggJoiner(TransformerMixin, SkrubBaseTransformer): +class AggJoiner(TransformerMixin, SkrubBaseEstimator): """Aggregate an auxiliary dataframe before joining it on a 
base dataframe. Apply numerical and categorical aggregation operations on the columns (i.e. `cols`) @@ -408,7 +408,7 @@ def get_feature_names_out(self): return self.all_outputs_ -class AggTarget(TransformerMixin, SkrubBaseTransformer): +class AggTarget(TransformerMixin, SkrubBaseEstimator): """Aggregate a target `y` before joining its aggregation on a base dataframe. Accepts :obj:`pandas.DataFrame` or :class:`polars.DataFrame` inputs. diff --git a/skrub/_apply_to_cols.py b/skrub/_apply_to_cols.py index cd4eaf427..7168cf70b 100644 --- a/skrub/_apply_to_cols.py +++ b/skrub/_apply_to_cols.py @@ -8,14 +8,14 @@ from . import selectors from ._apply_to_each_col import ApplyToEachCol from ._apply_to_sub_frame import ApplyToSubFrame -from ._base import SkrubBaseTransformer +from ._base import SkrubBaseEstimator from ._sklearn_compat import _VisualBlock from ._wrap_transformer import wrap_transformer _SELECT_ALL_COLUMNS = selectors.all() -class ApplyToCols(TransformerMixin, SkrubBaseTransformer): +class ApplyToCols(TransformerMixin, SkrubBaseEstimator): """ Apply a transformer to selected columns in a dataframe. diff --git a/skrub/_apply_to_each_col.py b/skrub/_apply_to_each_col.py index 36aa5b1ee..dfb054291 100644 --- a/skrub/_apply_to_each_col.py +++ b/skrub/_apply_to_each_col.py @@ -6,7 +6,7 @@ from . import _dataframe as sbd from . import _utils, selectors -from ._base import SkrubBaseTransformer +from ._base import SkrubBaseEstimator from ._join_utils import pick_column_names from ._single_column_transformer import RejectColumn, is_single_column_transformer @@ -16,7 +16,7 @@ _SELECT_ALL_COLUMNS = selectors.all() -class ApplyToEachCol(SkrubBaseTransformer, TransformerMixin): +class ApplyToEachCol(TransformerMixin, SkrubBaseEstimator): """ Map a transformer to columns in a dataframe. 
diff --git a/skrub/_apply_to_sub_frame.py b/skrub/_apply_to_sub_frame.py index 1a0458fed..f8e595590 100644 --- a/skrub/_apply_to_sub_frame.py +++ b/skrub/_apply_to_sub_frame.py @@ -3,7 +3,7 @@ from . import _dataframe as sbd from . import _utils, selectors -from ._base import SkrubBaseTransformer +from ._base import SkrubBaseEstimator from ._join_utils import pick_column_names __all__ = ["ApplyToSubFrame"] @@ -12,7 +12,7 @@ _SELECT_ALL_COLUMNS = selectors.all() -class ApplyToSubFrame(TransformerMixin, SkrubBaseTransformer): +class ApplyToSubFrame(TransformerMixin, SkrubBaseEstimator): """Apply a transformer to part of a dataframe. A subset of the dataframe is selected and passed to the transformer (as a diff --git a/skrub/_base.py b/skrub/_base.py index 241f4470e..901fb46d9 100644 --- a/skrub/_base.py +++ b/skrub/_base.py @@ -1,7 +1,7 @@ from sklearn.base import BaseEstimator -class SkrubBaseTransformer(BaseEstimator): +class SkrubBaseEstimator(BaseEstimator): """Base class for all skrub transformers. This is a class that all skrub transformers inherit from. diff --git a/skrub/_check_input.py b/skrub/_check_input.py index deb6d99ef..972f9e3a7 100644 --- a/skrub/_check_input.py +++ b/skrub/_check_input.py @@ -7,7 +7,7 @@ from . import _dataframe as sbd from . import _join_utils, _utils -from ._base import SkrubBaseTransformer +from ._base import SkrubBaseEstimator from ._dispatch import dispatch __all__ = ["CheckInputDataFrame", "cast_column_names_to_strings"] @@ -73,7 +73,7 @@ def _check_is_dataframe(df): return df -class CheckInputDataFrame(TransformerMixin, SkrubBaseTransformer): +class CheckInputDataFrame(TransformerMixin, SkrubBaseEstimator): """Check the dataframe entering a skrub pipeline. This transformer ensures that: diff --git a/skrub/_data_ops/_estimator.py b/skrub/_data_ops/_estimator.py index d797f2dc3..b7bb598c1 100644 --- a/skrub/_data_ops/_estimator.py +++ b/skrub/_data_ops/_estimator.py @@ -16,7 +16,7 @@ from .. import _dataframe as sbd from .. 
import _join_utils -from .._base import SkrubBaseTransformer +from .._base import SkrubBaseEstimator from .._sklearn_compat import _safe_indexing, _VisualBlock from .._utils import set_module from . import _evaluation @@ -180,7 +180,7 @@ def _get_params_html(self, deep=True, doc_link=""): @set_module("skrub") -class SkrubLearner(_DataOpWrapperMixin, SkrubBaseTransformer): +class SkrubLearner(_DataOpWrapperMixin, SkrubBaseEstimator): """Learner that evaluates a skrub DataOp. This class is not meant to be instantiated manually, ``SkrubLearner`` @@ -1197,7 +1197,7 @@ def iter_cv_splits(data_op, environment, *, keep_subsampling=False, cv=None): yield split_info -class _BaseParamSearch(_DataOpWrapperMixin, BaseEstimator): +class _BaseParamSearch(_DataOpWrapperMixin, SkrubBaseEstimator): """Base class for hyperparameter search objects. It defines some default implementations for getting results, plotting, and diff --git a/skrub/_drop_similar.py b/skrub/_drop_similar.py index 207f70675..cd7f3d6ac 100644 --- a/skrub/_drop_similar.py +++ b/skrub/_drop_similar.py @@ -14,7 +14,7 @@ from . import _dataframe as sbd from . import selectors as s -from ._base import SkrubBaseTransformer +from ._base import SkrubBaseEstimator from ._column_associations import column_associations from ._dataframe._common import raise_dispatch_unregistered_type from ._dispatch import dispatch @@ -36,7 +36,7 @@ def _filter_associations_polars(obj, threshold): return obj.filter(pl.col("cramer_v") >= threshold) -class DropSimilar(TransformerMixin, SkrubBaseTransformer): +class DropSimilar(TransformerMixin, SkrubBaseEstimator): """Drop columns found too redundant to the rest of the dataframe, according to association defined by Cramér's V. diff --git a/skrub/_interpolation_joiner.py b/skrub/_interpolation_joiner.py index 4cc1e0ba9..ea2f4e5d9 100644 --- a/skrub/_interpolation_joiner.py +++ b/skrub/_interpolation_joiner.py @@ -11,7 +11,7 @@ from . import _dataframe as sbd from . 
import _join_utils, _utils from . import selectors as s -from ._base import SkrubBaseTransformer +from ._base import SkrubBaseEstimator from ._minhash_encoder import MinHashEncoder from ._sklearn_compat import get_tags from ._table_vectorizer import TableVectorizer @@ -21,7 +21,7 @@ DEFAULT_VECTORIZER = TableVectorizer(high_cardinality=MinHashEncoder()) -class InterpolationJoiner(TransformerMixin, SkrubBaseTransformer): +class InterpolationJoiner(TransformerMixin, SkrubBaseEstimator): """Join with a table augmented by machine-learning predictions. This is similar to a usual equi-join, but instead of looking for actual diff --git a/skrub/_joiner.py b/skrub/_joiner.py index de27526ca..3b9642d7a 100644 --- a/skrub/_joiner.py +++ b/skrub/_joiner.py @@ -15,7 +15,7 @@ from . import _dataframe as sbd from . import _join_utils, _matching, _utils from . import selectors as s -from ._base import SkrubBaseTransformer +from ._base import SkrubBaseEstimator from ._check_input import CheckInputDataFrame from ._datetime_encoder import DatetimeEncoder from ._table_vectorizer import TableVectorizer @@ -77,7 +77,7 @@ def _make_vectorizer(table, string_encoder, rescale): return make_pipeline(skrubber, make_column_transformer(*transformers)) -class Joiner(TransformerMixin, SkrubBaseTransformer): +class Joiner(TransformerMixin, SkrubBaseEstimator): """Augment features in a main table by fuzzy-joining an auxiliary table to it. This transformer is initialized with an auxiliary table `aux_table`. It diff --git a/skrub/_matching.py b/skrub/_matching.py index 6b3d78772..1a1a7d991 100644 --- a/skrub/_matching.py +++ b/skrub/_matching.py @@ -3,10 +3,10 @@ from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_random_state -from ._base import SkrubBaseTransformer +from ._base import SkrubBaseEstimator -class Matching(SkrubBaseTransformer): +class Matching(SkrubBaseEstimator): """Base class for fuzzy-join matching & distance rescaling. 
This class is a helper for the ``Joiner`` and ``fuzzy_join``. It is diff --git a/skrub/_multi_agg_joiner.py b/skrub/_multi_agg_joiner.py index 6c78de957..7b99c5e38 100644 --- a/skrub/_multi_agg_joiner.py +++ b/skrub/_multi_agg_joiner.py @@ -9,7 +9,7 @@ from skrub._dataframe import _common as sbd from skrub._utils import _is_array_like -from ._base import SkrubBaseTransformer +from ._base import SkrubBaseEstimator def _is_iterable_of_iterable_of_str(x): @@ -19,7 +19,7 @@ def _is_iterable_of_iterable_of_str(x): ) -class MultiAggJoiner(TransformerMixin, SkrubBaseTransformer): +class MultiAggJoiner(TransformerMixin, SkrubBaseEstimator): """Extension of the :class:`AggJoiner` to multiple auxiliary tables. Apply numerical and categorical aggregation operations on the `cols` diff --git a/skrub/_select_cols.py b/skrub/_select_cols.py index 4dac4d1dc..e940d551b 100644 --- a/skrub/_select_cols.py +++ b/skrub/_select_cols.py @@ -1,11 +1,11 @@ from sklearn.base import TransformerMixin, check_is_fitted from . import selectors as s -from ._base import SkrubBaseTransformer +from ._base import SkrubBaseEstimator from ._single_column_transformer import SingleColumnTransformer -class SelectCols(TransformerMixin, SkrubBaseTransformer): +class SelectCols(TransformerMixin, SkrubBaseEstimator): """Select a subset of a DataFrame's columns. A ``ValueError`` is raised if any of the provided column names are not in the @@ -100,7 +100,7 @@ def get_feature_names_out(self, input_features=None): return self.columns_ -class DropCols(TransformerMixin, SkrubBaseTransformer): +class DropCols(TransformerMixin, SkrubBaseEstimator): """Drop a subset of a DataFrame's columns. The other columns are kept in their original order. A ``ValueError`` is raised if diff --git a/skrub/_single_column_transformer.py b/skrub/_single_column_transformer.py index f7f1e913d..ea6f52712 100644 --- a/skrub/_single_column_transformer.py +++ b/skrub/_single_column_transformer.py @@ -9,7 +9,7 @@ from . 
import _dataframe as sbd from . import _utils -from ._base import SkrubBaseTransformer +from ._base import SkrubBaseEstimator __all__ = ["SingleColumnTransformer", "RejectColumn"] @@ -120,7 +120,7 @@ class RejectColumn(ValueError): pass -class SingleColumnTransformer(SkrubBaseTransformer): +class SingleColumnTransformer(SkrubBaseEstimator): """Base class for single-column transformers. Such transformers are applied independently to each column by diff --git a/skrub/_squashing_scaler.py b/skrub/_squashing_scaler.py index 606eb96e6..c823918cf 100644 --- a/skrub/_squashing_scaler.py +++ b/skrub/_squashing_scaler.py @@ -7,7 +7,7 @@ from skrub._sklearn_compat import validate_data -from ._base import SkrubBaseTransformer +from ._base import SkrubBaseEstimator def _mask_inf(X): @@ -53,7 +53,7 @@ def _soft_clip(X, max_absolute_value, mask_inf): return X -class _MinMaxScaler(OneToOneFeatureMixin, TransformerMixin, SkrubBaseTransformer): +class _MinMaxScaler(OneToOneFeatureMixin, TransformerMixin, SkrubBaseEstimator): """A variation of scikit-learn MinMaxScaler. A simple min-max scaler that centers the median to zero and scales @@ -84,7 +84,7 @@ def transform(self, X): return self.scale_ * (X - self.median_) -class SquashingScaler(OneToOneFeatureMixin, TransformerMixin, SkrubBaseTransformer): +class SquashingScaler(OneToOneFeatureMixin, TransformerMixin, SkrubBaseEstimator): r"""Perform robust centering and scaling followed by soft clipping. When features have large outliers, smooth clipping prevents the outliers from diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 59910dafe..9af7bdee0 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -12,7 +12,7 @@ from . import _dataframe as sbd from . import _utils from . 
import selectors as s -from ._base import SkrubBaseTransformer +from ._base import SkrubBaseEstimator from ._check_input import CheckInputDataFrame from ._clean_categories import CleanCategories from ._clean_null_strings import CleanNullStrings @@ -184,7 +184,7 @@ def _get_preprocessors( return steps -class Cleaner(TransformerMixin, SkrubBaseTransformer): +class Cleaner(TransformerMixin, SkrubBaseEstimator): """Column-wise consistency checks and sanitization of dtypes, null values and dates. The ``Cleaner`` performs some consistency checks and basic preprocessing @@ -542,7 +542,7 @@ def get_feature_names_out(self, input_features=None): return np.asarray(self.all_outputs_) -class TableVectorizer(TransformerMixin, SkrubBaseTransformer): +class TableVectorizer(TransformerMixin, SkrubBaseEstimator): """Transform a dataframe to a numeric (vectorized) representation. This transformer preprocesses the given dataframe by first cleaning the data From e4458b43457b3eb643cde63fefe7035c69ecd67b Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com> Date: Tue, 16 Jun 2026 13:21:45 +0200 Subject: [PATCH 20/20] Apply suggestion from @jeromedockes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérôme Dockès --- skrub/_base.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/skrub/_base.py b/skrub/_base.py index 241f4470e..3d140a79d 100644 --- a/skrub/_base.py +++ b/skrub/_base.py @@ -2,11 +2,12 @@ class SkrubBaseTransformer(BaseEstimator): - """Base class for all skrub transformers. + """Base class for all skrub estimators. This is a class that all skrub transformers inherit from. - For the moment, it's only used for the documentation url, but eventually - it will be used for other things as well. + For the moment, it's only used to set the documentation url for estimator diagrams. 
+ + Think twice before adding anything to this class: it is a base class of *all* skrub estimators, including meta-estimators like ApplyToCols, the SingleColumnTransformer base class, and the SkrubLearners created by DataOps. """ _doc_link_module = "skrub"