Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,9 @@ Changes
<jeromedockes>`.
- The ``exclude_cols`` of :meth:`DataOp.skb.apply` can now be a DataOp.
:pr:`2050` by :user:`Jérôme Dockès <jeromedockes>`.
- Skrub estimators now correctly show links to the documentation in the HTML
representation that is generated for notebooks. :pr:`2036` by :user:`Riccardo
Cappuzzo <rcap107>`.

Bugfixes
--------
Expand Down
18 changes: 16 additions & 2 deletions skrub/_apply_to_cols.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,19 @@
based on the type of the transformer passed to it.
"""

from sklearn.base import BaseEstimator, TransformerMixin, check_is_fitted
from sklearn.base import TransformerMixin, check_is_fitted

from . import selectors
from ._apply_to_each_col import ApplyToEachCol
from ._apply_to_sub_frame import ApplyToSubFrame
from ._base import BaseTransformer
from ._sklearn_compat import _VisualBlock
from ._wrap_transformer import wrap_transformer

_SELECT_ALL_COLUMNS = selectors.all()


class ApplyToCols(TransformerMixin, BaseEstimator):
class ApplyToCols(TransformerMixin, BaseTransformer):
"""
Apply a transformer to selected columns in a dataframe.

Expand Down Expand Up @@ -431,6 +433,18 @@ def get_feature_names_out(self, input_features=None):

return self._wrapped_transformer.get_feature_names_out(input_features)

def _sk_visual_block_(self):
# This is needed because when ApplyToCols is used with a transformer like

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure I understood this comment, but the scikit-learn diagram machinery is quite complicated, so maybe it's not easy to explain in a short comment.

# TableVectorizer, the estimator is rendered as a parallel block, which
# does not include the documentation link.
# Returning a serial block that wraps the transformer restores the link.
return _VisualBlock(
"serial",
[self.transformer],
names=[self.transformer.__class__.__name__],
name_details=[str(self.transformer)],
)

def __getattr__(self, name):
if name == "transformers_" and isinstance(
getattr(self, "_wrapped_transformer", None), ApplyToSubFrame
Expand Down
17 changes: 17 additions & 0 deletions skrub/_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from sklearn.base import BaseEstimator


class BaseTransformer(BaseEstimator):

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we can call it SkrubBaseEstimator instead, because the point is not transformer vs. estimator, but that it should point to the skrub documentation.
It also applies to SkrubLearners, which are not (always) transformers.

It could also be SkrubEstimator, but that is too similar to SkrubLearner.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, could you add a docstring to this class saying that it is a base class for things shared by all estimators defined in skrub, which at the moment is only the documentation URL?

_doc_link_module = "skrub"

# Defining this as a property because it inherits from _HTMLDocumentationLinkMixin,
# which also defines _doc_link_template as a property, and we want to be able
# to override it.
@property
def _doc_link_template(self):
    """URL template used to build the estimator's documentation link.

    Returns
    -------
    str
        The value stored under the ``"__doc_link_template"`` attribute if one
        has been set, otherwise the skrub reference-documentation template.
        The ``{estimator_module}`` and ``{estimator_name}`` placeholders are
        returned unformatted.
    """
    # The attribute name is passed as a string literal, so the lookup key is
    # exactly "__doc_link_template" (no private-name mangling is involved).
    return getattr(
        self,
        "__doc_link_template",
        "https://skrub-data.org/stable/reference/generated/"
        "{estimator_module}.{estimator_name}.html",
    )

@_doc_link_template.setter
def _doc_link_template(self, value):
    """Store a custom URL template that takes precedence over the default.

    Parameters
    ----------
    value : str
        Template returned by subsequent reads of ``_doc_link_template``.
    """
    # Redefining a property in a subclass replaces the whole descriptor, so a
    # setter has to be declared again here; without it, assigning to
    # ``_doc_link_template`` raises AttributeError.
    # ``setattr`` with a string is deliberate: writing
    # ``self.__doc_link_template`` inside a class body would be name-mangled
    # and no longer match the key read by the getter.
    setattr(self, "__doc_link_template", value)
11 changes: 11 additions & 0 deletions skrub/_data_ops/_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -787,6 +787,17 @@ def describe_params(self):
"""
return describe_params(eval_choices(self.data_op), choice_graph(self.data_op))

_doc_link_module = "skrub"

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here too, we could remove this and use SkrubBaseEstimator instead of BaseEstimator as the base class, right? The same goes for the base class of _BaseParamSearch.


@property
def _doc_link_template(self):
    """URL template used to build the estimator's documentation link.

    Returns
    -------
    str
        The value stored under the ``"__doc_link_template"`` attribute if one
        has been set, otherwise the skrub reference-documentation template.
        The ``{estimator_module}`` and ``{estimator_name}`` placeholders are
        returned unformatted.
    """
    # The attribute name is passed as a string literal, so the lookup key is
    # exactly "__doc_link_template" (no private-name mangling is involved).
    return getattr(
        self,
        "__doc_link_template",
        "https://skrub-data.org/stable/reference/generated/"
        "{estimator_module}.{estimator_name}.html",
    )

@_doc_link_template.setter
def _doc_link_template(self, value):
    """Store a custom URL template that takes precedence over the default.

    Parameters
    ----------
    value : str
        Template returned by subsequent reads of ``_doc_link_template``.
    """
    # Redefining a property in a subclass replaces the whole descriptor, so a
    # setter has to be declared again here; without it, assigning to
    # ``_doc_link_template`` raises AttributeError.
    # ``setattr`` with a string is deliberate: writing
    # ``self.__doc_link_template`` inside a class body would be name-mangled
    # and no longer match the key read by the getter.
    setattr(self, "__doc_link_template", value)


def _to_Xy_pipeline(learner, environment):
    """Return the result of the learner's ``__skrub_to_Xy_pipeline__`` hook.

    The hook is looked up on ``learner`` and called with ``environment`` as its
    only argument; whatever it returns is passed back unchanged.
    """
    build_pipeline = learner.__skrub_to_Xy_pipeline__
    return build_pipeline(environment)
Expand Down
7 changes: 4 additions & 3 deletions skrub/_select_cols.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from sklearn.base import BaseEstimator, TransformerMixin, check_is_fitted
from sklearn.base import TransformerMixin, check_is_fitted

from . import selectors as s
from ._base import BaseTransformer
from ._single_column_transformer import SingleColumnTransformer


class SelectCols(TransformerMixin, BaseEstimator):
class SelectCols(TransformerMixin, BaseTransformer):
"""Select a subset of a DataFrame's columns.

A ``ValueError`` is raised if any of the provided column names are not in the
Expand Down Expand Up @@ -99,7 +100,7 @@ def get_feature_names_out(self, input_features=None):
return self.columns_


class DropCols(TransformerMixin, BaseEstimator):
class DropCols(TransformerMixin, BaseTransformer):
"""Drop a subset of a DataFrame's columns.

The other columns are kept in their original order. A ``ValueError`` is raised if
Expand Down
4 changes: 2 additions & 2 deletions skrub/_single_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
import re
import textwrap

from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import check_is_fitted

from . import _dataframe as sbd
from . import _utils
from ._base import BaseTransformer

__all__ = ["SingleColumnTransformer", "RejectColumn"]

Expand Down Expand Up @@ -120,7 +120,7 @@ class RejectColumn(ValueError):
pass


class SingleColumnTransformer(BaseEstimator):
class SingleColumnTransformer(BaseTransformer):
"""Base class for single-column transformers.

Such transformers are applied independently to each column by
Expand Down
3 changes: 2 additions & 1 deletion skrub/_squashing_scaler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from sklearn.preprocessing import RobustScaler
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted

from skrub._base import BaseTransformer
from skrub._sklearn_compat import validate_data


Expand Down Expand Up @@ -82,7 +83,7 @@ def transform(self, X):
return self.scale_ * (X - self.median_)


class SquashingScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
class SquashingScaler(OneToOneFeatureMixin, TransformerMixin, BaseTransformer):
r"""Perform robust centering and scaling followed by soft clipping.

When features have large outliers, smooth clipping prevents the outliers from
Expand Down
9 changes: 6 additions & 3 deletions skrub/_table_vectorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
from collections.abc import Iterable

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.base import TransformerMixin, clone
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.validation import check_is_fitted

from . import _dataframe as sbd
from . import _utils
from . import selectors as s
from ._base import BaseTransformer
from ._check_input import CheckInputDataFrame
from ._clean_categories import CleanCategories
from ._clean_null_strings import CleanNullStrings
Expand All @@ -31,6 +32,8 @@


class PassThrough(SingleColumnTransformer):
_doc_link_module = ""

def fit_transform(self, column, y=None):
return column

Expand Down Expand Up @@ -181,7 +184,7 @@ def _get_preprocessors(
return steps


class Cleaner(TransformerMixin, BaseEstimator):
class Cleaner(TransformerMixin, BaseTransformer):
"""Column-wise consistency checks and sanitization of dtypes, null values and dates.

The ``Cleaner`` performs some consistency checks and basic preprocessing
Expand Down Expand Up @@ -539,7 +542,7 @@ def get_feature_names_out(self, input_features=None):
return np.asarray(self.all_outputs_)


class TableVectorizer(TransformerMixin, BaseEstimator):
class TableVectorizer(TransformerMixin, BaseTransformer):
"""Transform a dataframe to a numeric (vectorized) representation.

This transformer preprocesses the given dataframe by first cleaning the data
Expand Down
53 changes: 53 additions & 0 deletions skrub/tests/test_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import re

from sklearn.utils import estimator_html_repr

from skrub import (
ApplyToCols,
Cleaner,
DropCols,
SelectCols,
StringEncoder,
TableVectorizer,
)


def test_doc_link_apply_to_cols():
    """The HTML repr of ``ApplyToCols`` contains the expected skrub doc links.

    Wrapping a ``StringEncoder`` must expose the ``ApplyToCols`` link, and
    wrapping a ``TableVectorizer`` must expose the ``TableVectorizer`` link.
    """
    # Only http(s) href targets that contain no '#' are captured.
    href_pattern = re.compile(r'href="(https?://[^#"]+)"')

    def rendered_links(estimator):
        return set(href_pattern.findall(estimator_html_repr(estimator)))

    apply_to_cols_url = (
        "https://skrub-data.org/stable/reference/generated/skrub.ApplyToCols.html"
    )
    assert apply_to_cols_url in rendered_links(ApplyToCols(StringEncoder()))

    table_vectorizer_url = (
        "https://skrub-data.org/stable/reference/generated/skrub.TableVectorizer.html"
    )
    assert table_vectorizer_url in rendered_links(ApplyToCols(TableVectorizer()))


def test_doc_link_skrub_class_select_cols():
    """``SelectCols`` and ``DropCols`` report their skrub documentation URL."""
    select_cols_url = (
        "https://skrub-data.org/stable/reference/generated/skrub.SelectCols.html"
    )
    assert SelectCols(cols=[])._get_doc_link() == select_cols_url

    drop_cols_url = (
        "https://skrub-data.org/stable/reference/generated/skrub.DropCols.html"
    )
    assert DropCols(cols=[])._get_doc_link() == drop_cols_url


def test_doc_link_table_vectorizer():
    """``TableVectorizer`` and ``Cleaner`` report their skrub documentation URL."""
    table_vectorizer_url = (
        "https://skrub-data.org/stable/reference/generated/skrub.TableVectorizer.html"
    )
    assert TableVectorizer()._get_doc_link() == table_vectorizer_url

    cleaner_url = (
        "https://skrub-data.org/stable/reference/generated/skrub.Cleaner.html"
    )
    assert Cleaner()._get_doc_link() == cleaner_url
27 changes: 27 additions & 0 deletions skrub/tests/test_single_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

from skrub import GapEncoder
from skrub import _dataframe as sbd
from skrub._single_column_transformer import (
SingleColumnTransformer,
Expand Down Expand Up @@ -91,6 +92,32 @@ def fit(self, column, y=None):
assert transformer.get_feature_names_out() == [sbd.name(column)]


def test_doc_link_skrub_class():
    """``GapEncoder``, imported from ``skrub``, links to the skrub documentation."""
    expected_url = (
        "https://skrub-data.org/stable/reference/generated/skrub.GapEncoder.html"
    )
    assert GapEncoder()._get_doc_link() == expected_url


def test_doc_link_user_defined_subclass():
    """A subclass whose module is outside ``skrub`` gets an empty doc link."""

    class MyTransformer(SingleColumnTransformer):
        def fit_transform(self, column, y=None):
            return column

        def transform(self, column):
            return column

    # This test module is itself named
    # "skrub.tests.test_single_column_transformer", so by default MyTransformer
    # would be seen as a skrub class and would receive a doc link. Overwrite the
    # module name to simulate a class defined in a user's own package.
    MyTransformer.__module__ = "user_package"

    doc_link = MyTransformer()._get_doc_link()
    assert doc_link == ""


def test_is_single_column_transformer():
class S:
__single_column_transformer__ = True
Expand Down
Loading