Skip to content
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
5fcf629
ENH - adding doc link to html repr of estimators
rcap107 Apr 21, 2026
baf63fb
moving the new methods
rcap107 Apr 21, 2026
0fde8fd
adding more
rcap107 Apr 21, 2026
e40ecfa
fixing applytocols
rcap107 Apr 21, 2026
3156828
fixing typo
rcap107 Apr 21, 2026
8213776
adding tests
rcap107 Apr 21, 2026
4a41b3f
adding a comment
rcap107 Apr 21, 2026
b3403c0
changelog
rcap107 Apr 21, 2026
ade23b8
removing unneeded setter
rcap107 Apr 21, 2026
fe0fc13
adding more tests for coverage
rcap107 Apr 22, 2026
7f774ea
Merge branch 'main' into enh-add-doc-link-to-estimator
rcap107 Apr 23, 2026
cb48d16
Merge remote-tracking branch 'upstream/HEAD' into enh-add-doc-link-to…
rcap107 Jun 8, 2026
c2eb8b5
Merge branch 'enh-add-doc-link-to-estimator' of github.com:rcap107/sk…
rcap107 Jun 8, 2026
1f1a828
moving changes to a single file
rcap107 Jun 8, 2026
79cc5e9
_
rcap107 Jun 8, 2026
48949d2
tests
rcap107 Jun 8, 2026
f1f07d9
removing unnecessary code
rcap107 Jun 8, 2026
ea59f93
Merge remote-tracking branch 'upstream/HEAD' into enh-add-doc-link-to…
rcap107 Jun 16, 2026
a1f520f
addressing comments from review
rcap107 Jun 16, 2026
20702bd
bringing back code block and better comment
rcap107 Jun 16, 2026
ce9dc05
addressing all missing files
rcap107 Jun 16, 2026
0fe8479
fixing relative imports
rcap107 Jun 16, 2026
1752ffa
renaming to estimator, fixing order, adding to paramsearch
rcap107 Jun 16, 2026
e4458b4
Apply suggestion from @jeromedockes
rcap107 Jun 16, 2026
d601e99
Merge branch 'enh-add-doc-link-to-estimator' of github.com:rcap107/sk…
rcap107 Jun 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,9 @@ Changes
<jeromedockes>`.
- The ``exclude_cols`` of :meth:`DataOp.skb.apply` can now be a DataOp.
:pr:`2050` by :user:`Jérôme Dockès <jeromedockes>`.
- Skrub estimators now correctly show links to the documentation in the HTML
representation that is generated for notebooks. :pr:`2036` by :user:`Riccardo
Cappuzzo <rcap107>`.

Bugfixes
--------
Expand Down
7 changes: 4 additions & 3 deletions skrub/_agg_joiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,15 @@

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.base import TransformerMixin
from sklearn.utils.validation import check_is_fitted

from skrub import _dataframe as sbd
from skrub import _join_utils, _utils
from skrub import selectors as s
from skrub._dispatch import dispatch, raise_dispatch_unregistered_type

from ._base import SkrubBaseTransformer
from ._check_input import CheckInputDataFrame

try:
Expand Down Expand Up @@ -168,7 +169,7 @@ def check_other_inputs(operations, suffix):
return operations, suffix


class AggJoiner(TransformerMixin, BaseEstimator):
class AggJoiner(TransformerMixin, SkrubBaseTransformer):
"""Aggregate an auxiliary dataframe before joining it on a base dataframe.

Apply numerical and categorical aggregation operations on the columns (i.e. `cols`)
Expand Down Expand Up @@ -407,7 +408,7 @@ def get_feature_names_out(self):
return self.all_outputs_


class AggTarget(TransformerMixin, BaseEstimator):
class AggTarget(TransformerMixin, SkrubBaseTransformer):
"""Aggregate a target `y` before joining its aggregation on a base dataframe.

Accepts :obj:`pandas.DataFrame` or :class:`polars.DataFrame` inputs.
Expand Down
20 changes: 18 additions & 2 deletions skrub/_apply_to_cols.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,19 @@
based on the type of the transformer passed to it.
"""

from sklearn.base import BaseEstimator, TransformerMixin, check_is_fitted
from sklearn.base import TransformerMixin, check_is_fitted

from . import selectors
from ._apply_to_each_col import ApplyToEachCol
from ._apply_to_sub_frame import ApplyToSubFrame
from ._base import SkrubBaseTransformer
from ._sklearn_compat import _VisualBlock
from ._wrap_transformer import wrap_transformer

_SELECT_ALL_COLUMNS = selectors.all()


class ApplyToCols(TransformerMixin, BaseEstimator):
class ApplyToCols(TransformerMixin, SkrubBaseTransformer):
"""
Apply a transformer to selected columns in a dataframe.

Expand Down Expand Up @@ -431,6 +433,20 @@ def get_feature_names_out(self, input_features=None):

return self._wrapped_transformer.get_feature_names_out(input_features)

def _sk_visual_block_(self):
# This is needed because cases like ApplyToCols(TableVectorizer())

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@glemaitre could you give a bit of context on why this is needed here?

# would show the TableVectorizer as a parallel block, which would not
# add the documentation link. With this override the problem is fixed.
# The same problem happens for ApplyToCols(ApplyToCols(...)) (not that
# someone should do that, but it is possible)

return _VisualBlock(
"serial",
[self.transformer],
names=[self.transformer.__class__.__name__],
name_details=[str(self.transformer)],
)

def __getattr__(self, name):
if name == "transformers_" and isinstance(
getattr(self, "_wrapped_transformer", None), ApplyToSubFrame
Expand Down
5 changes: 3 additions & 2 deletions skrub/_apply_to_each_col.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import itertools

from joblib import Parallel, delayed
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.base import TransformerMixin, clone
from sklearn.utils.validation import check_is_fitted

from . import _dataframe as sbd
from . import _utils, selectors
from ._base import SkrubBaseTransformer
from ._join_utils import pick_column_names
from ._single_column_transformer import RejectColumn, is_single_column_transformer

Expand All @@ -15,7 +16,7 @@
_SELECT_ALL_COLUMNS = selectors.all()


class ApplyToEachCol(BaseEstimator, TransformerMixin):
class ApplyToEachCol(SkrubBaseTransformer, TransformerMixin):

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

while we're at it let's put the mixin before the base class

"""
Map a transformer to columns in a dataframe.

Expand Down
5 changes: 3 additions & 2 deletions skrub/_apply_to_sub_frame.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.base import TransformerMixin, clone
from sklearn.utils.validation import check_is_fitted

from . import _dataframe as sbd
from . import _utils, selectors
from ._base import SkrubBaseTransformer
from ._join_utils import pick_column_names

__all__ = ["ApplyToSubFrame"]
Expand All @@ -11,7 +12,7 @@
_SELECT_ALL_COLUMNS = selectors.all()


class ApplyToSubFrame(TransformerMixin, BaseEstimator):
class ApplyToSubFrame(TransformerMixin, SkrubBaseTransformer):
"""Apply a transformer to part of a dataframe.

A subset of the dataframe is selected and passed to the transformer (as a
Expand Down
24 changes: 24 additions & 0 deletions skrub/_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from sklearn.base import BaseEstimator


class SkrubBaseTransformer(BaseEstimator):
"""Base class for all skrub transformers.

This is a class that all skrub transformers inherit from.
For the moment, it's only used for the documentation url, but eventually
it will be used for other things as well.
"""
Comment thread
rcap107 marked this conversation as resolved.
Outdated

_doc_link_module = "skrub"

# The parent mixin (_HTMLDocumentationLinkMixin, reached via BaseEstimator)
# already exposes _doc_link_template as a property, so the skrub override
# is written as a property as well.
@property
def _doc_link_template(self):
    """URL template for this estimator's documentation page.

    The ``{estimator_module}`` and ``{estimator_name}`` placeholders are
    returned unformatted. An instance attribute literally named
    ``__doc_link_template`` takes precedence when it exists; otherwise the
    skrub "stable" reference URL template is returned.
    """
    # NOTE(review): nothing visible in this class sets
    # ``__doc_link_template``; presumably it is only present when assigned
    # from outside -- confirm.
    skrub_reference_template = (
        "https://skrub-data.org/stable/reference/generated/"
        "{estimator_module}.{estimator_name}.html"
    )
    return getattr(self, "__doc_link_template", skrub_reference_template)
5 changes: 3 additions & 2 deletions skrub/_check_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.base import TransformerMixin
from sklearn.utils.validation import check_is_fitted

from . import _dataframe as sbd
from . import _join_utils, _utils
from ._base import SkrubBaseTransformer
from ._dispatch import dispatch

__all__ = ["CheckInputDataFrame", "cast_column_names_to_strings"]
Expand Down Expand Up @@ -72,7 +73,7 @@ def _check_is_dataframe(df):
return df


class CheckInputDataFrame(TransformerMixin, BaseEstimator):
class CheckInputDataFrame(TransformerMixin, SkrubBaseTransformer):
"""Check the dataframe entering a skrub pipeline.

This transformer ensures that:
Expand Down
3 changes: 2 additions & 1 deletion skrub/_data_ops/_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from .. import _dataframe as sbd
from .. import _join_utils
from .._base import SkrubBaseTransformer
from .._sklearn_compat import _safe_indexing, _VisualBlock
from .._utils import set_module
from . import _evaluation
Expand Down Expand Up @@ -179,7 +180,7 @@ def _get_params_html(self, deep=True, doc_link=""):


@set_module("skrub")
class SkrubLearner(_DataOpWrapperMixin, BaseEstimator):
class SkrubLearner(_DataOpWrapperMixin, SkrubBaseTransformer):
"""Learner that evaluates a skrub DataOp.

This class is not meant to be instantiated manually, ``SkrubLearner``
Expand Down
10 changes: 10 additions & 0 deletions skrub/_data_ops/tests/test_estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -1460,3 +1460,13 @@ def load_data():
pred = X.skb.apply(DummyClassifier(), y=y)
search = pred.skb.make_grid_search(scoring="roc_auc").fit({})
assert search.results_.shape[0] == 1


def test_learner_docstring():
data_op, data = get_data_op_and_data("simple")
split = data_op.skb.train_test_split(data)
learner = data_op.skb.make_learner().fit(split["train"])
Comment on lines +1466 to +1468

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need it to be fitted? otherwise maybe we can slightly simplify & speed up

Suggested change
data_op, data = get_data_op_and_data("simple")
split = data_op.skb.train_test_split(data)
learner = data_op.skb.make_learner().fit(split["train"])
learner = skrub.var('a').skb.make_learner()

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

whoops, didn't see the comment. I'll fix this when I'm cleaning up for the release

link = learner._get_doc_link()
assert link == (
"https://skrub-data.org/stable/reference/generated/skrub.SkrubLearner.html"
)
5 changes: 3 additions & 2 deletions skrub/_drop_similar.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@
pass
import numbers

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.base import TransformerMixin
from sklearn.utils.validation import check_is_fitted

from . import _dataframe as sbd
from . import selectors as s
from ._base import SkrubBaseTransformer
from ._column_associations import column_associations
from ._dataframe._common import raise_dispatch_unregistered_type
from ._dispatch import dispatch
Expand All @@ -35,7 +36,7 @@ def _filter_associations_polars(obj, threshold):
return obj.filter(pl.col("cramer_v") >= threshold)


class DropSimilar(TransformerMixin, BaseEstimator):
class DropSimilar(TransformerMixin, SkrubBaseTransformer):
"""Drop columns found too redundant to the rest of the dataframe,
according to association defined by Cramér's V.

Expand Down
5 changes: 3 additions & 2 deletions skrub/_interpolation_joiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import joblib
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.base import TransformerMixin, clone
from sklearn.ensemble import (
HistGradientBoostingClassifier,
HistGradientBoostingRegressor,
Expand All @@ -11,6 +11,7 @@
from . import _dataframe as sbd
from . import _join_utils, _utils
from . import selectors as s
from ._base import SkrubBaseTransformer
from ._minhash_encoder import MinHashEncoder
from ._sklearn_compat import get_tags
from ._table_vectorizer import TableVectorizer
Expand All @@ -20,7 +21,7 @@
DEFAULT_VECTORIZER = TableVectorizer(high_cardinality=MinHashEncoder())


class InterpolationJoiner(TransformerMixin, BaseEstimator):
class InterpolationJoiner(TransformerMixin, SkrubBaseTransformer):
"""Join with a table augmented by machine-learning predictions.

This is similar to a usual equi-join, but instead of looking for actual
Expand Down
5 changes: 3 additions & 2 deletions skrub/_joiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from functools import partial

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.base import TransformerMixin, clone
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.pipeline import make_pipeline
Expand All @@ -15,6 +15,7 @@
from . import _dataframe as sbd
from . import _join_utils, _matching, _utils
from . import selectors as s
from ._base import SkrubBaseTransformer
from ._check_input import CheckInputDataFrame
from ._datetime_encoder import DatetimeEncoder
from ._table_vectorizer import TableVectorizer
Expand Down Expand Up @@ -76,7 +77,7 @@ def _make_vectorizer(table, string_encoder, rescale):
return make_pipeline(skrubber, make_column_transformer(*transformers))


class Joiner(TransformerMixin, BaseEstimator):
class Joiner(TransformerMixin, SkrubBaseTransformer):
"""Augment features in a main table by fuzzy-joining an auxiliary table to it.

This transformer is initialized with an auxiliary table `aux_table`. It
Expand Down
5 changes: 3 additions & 2 deletions skrub/_matching.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import numpy as np
from scipy import sparse
from sklearn.base import BaseEstimator
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_random_state

from ._base import SkrubBaseTransformer

class Matching(BaseEstimator):

class Matching(SkrubBaseTransformer):
"""Base class for fuzzy-join matching & distance rescaling.

This class is a helper for the ``Joiner`` and ``fuzzy_join``. It is
Expand Down
6 changes: 4 additions & 2 deletions skrub/_multi_agg_joiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
The MultiAggJoiner extends AggJoiner to multiple auxiliary tables.
"""

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.base import TransformerMixin
from sklearn.utils.validation import check_is_fitted

from skrub._agg_joiner import AggJoiner
from skrub._dataframe import _common as sbd
from skrub._utils import _is_array_like

from ._base import SkrubBaseTransformer


def _is_iterable_of_iterable_of_str(x):
"Return True if x is an iterable of iterable of str and False otherwise."
Expand All @@ -17,7 +19,7 @@ def _is_iterable_of_iterable_of_str(x):
)


class MultiAggJoiner(TransformerMixin, BaseEstimator):
class MultiAggJoiner(TransformerMixin, SkrubBaseTransformer):
"""Extension of the :class:`AggJoiner` to multiple auxiliary tables.

Apply numerical and categorical aggregation operations on the `cols`
Expand Down
7 changes: 4 additions & 3 deletions skrub/_select_cols.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from sklearn.base import BaseEstimator, TransformerMixin, check_is_fitted
from sklearn.base import TransformerMixin, check_is_fitted

from . import selectors as s
from ._base import SkrubBaseTransformer
from ._single_column_transformer import SingleColumnTransformer


class SelectCols(TransformerMixin, BaseEstimator):
class SelectCols(TransformerMixin, SkrubBaseTransformer):
"""Select a subset of a DataFrame's columns.

A ``ValueError`` is raised if any of the provided column names are not in the
Expand Down Expand Up @@ -99,7 +100,7 @@ def get_feature_names_out(self, input_features=None):
return self.columns_


class DropCols(TransformerMixin, BaseEstimator):
class DropCols(TransformerMixin, SkrubBaseTransformer):
"""Drop a subset of a DataFrame's columns.

The other columns are kept in their original order. A ``ValueError`` is raised if
Expand Down
4 changes: 2 additions & 2 deletions skrub/_single_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
import re
import textwrap

from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import check_is_fitted

from . import _dataframe as sbd
from . import _utils
from ._base import SkrubBaseTransformer

__all__ = ["SingleColumnTransformer", "RejectColumn"]

Expand Down Expand Up @@ -120,7 +120,7 @@ class RejectColumn(ValueError):
pass


class SingleColumnTransformer(BaseEstimator):
class SingleColumnTransformer(SkrubBaseTransformer):
"""Base class for single-column transformers.

Such transformers are applied independently to each column by
Expand Down
Loading