From 15666fd357cdeba6db5e1e5339a5e68492a5e1f7 Mon Sep 17 00:00:00 2001
From: Maureen <githaigamaureen@gmail.com>
Date: Thu, 28 May 2026 19:05:46 +0300
Subject: [PATCH 1/4] grouping examples into sections

---
 examples/{ => data_ops}/0100_squashing_scaler.py                  | 0
 .../07_grid_searching_with_the_tablevectorizer.py                 | 0
 examples/{ => encoders}/0010_encodings.py                         | 0
 examples/{ => encoders}/0020_text_with_string_encoders.py         | 0
 examples/{ => encoders}/0030_datetime_encoder.py                  | 0
 examples/{ => joiners_and_utils}/0050_deduplication.py            | 0
 examples/{ => joiners_and_utils}/0090_apply_to_cols.py            | 0
 examples/{ => joiners_and_utils/joiners}/0040_fuzzy_joining.py    | 0
 .../{ => joiners_and_utils/joiners}/0060_multiple_key_join.py     | 0
 examples/{ => joiners_and_utils/joiners}/0070_join_aggregation.py | 0
 .../{ => joiners_and_utils/joiners}/0080_interpolation_join.py    | 0
 .../joiners}/08_join_aggregation_full.py                          | 0
 12 files changed, 0 insertions(+), 0 deletions(-)
 rename examples/{ => data_ops}/0100_squashing_scaler.py (100%)
 rename examples/{FIXME => data_ops}/07_grid_searching_with_the_tablevectorizer.py (100%)
 rename examples/{ => encoders}/0010_encodings.py (100%)
 rename examples/{ => encoders}/0020_text_with_string_encoders.py (100%)
 rename examples/{ => encoders}/0030_datetime_encoder.py (100%)
 rename examples/{ => joiners_and_utils}/0050_deduplication.py (100%)
 rename examples/{ => joiners_and_utils}/0090_apply_to_cols.py (100%)
 rename examples/{ => joiners_and_utils/joiners}/0040_fuzzy_joining.py (100%)
 rename examples/{ => joiners_and_utils/joiners}/0060_multiple_key_join.py (100%)
 rename examples/{ => joiners_and_utils/joiners}/0070_join_aggregation.py (100%)
 rename examples/{ => joiners_and_utils/joiners}/0080_interpolation_join.py (100%)
 rename examples/{FIXME => joiners_and_utils/joiners}/08_join_aggregation_full.py (100%)

diff --git a/examples/0100_squashing_scaler.py b/examples/data_ops/0100_squashing_scaler.py
similarity index 100%
rename from examples/0100_squashing_scaler.py
rename to examples/data_ops/0100_squashing_scaler.py
diff --git a/examples/FIXME/07_grid_searching_with_the_tablevectorizer.py b/examples/data_ops/07_grid_searching_with_the_tablevectorizer.py
similarity index 100%
rename from examples/FIXME/07_grid_searching_with_the_tablevectorizer.py
rename to examples/data_ops/07_grid_searching_with_the_tablevectorizer.py
diff --git a/examples/0010_encodings.py b/examples/encoders/0010_encodings.py
similarity index 100%
rename from examples/0010_encodings.py
rename to examples/encoders/0010_encodings.py
diff --git a/examples/0020_text_with_string_encoders.py b/examples/encoders/0020_text_with_string_encoders.py
similarity index 100%
rename from examples/0020_text_with_string_encoders.py
rename to examples/encoders/0020_text_with_string_encoders.py
diff --git a/examples/0030_datetime_encoder.py b/examples/encoders/0030_datetime_encoder.py
similarity index 100%
rename from examples/0030_datetime_encoder.py
rename to examples/encoders/0030_datetime_encoder.py
diff --git a/examples/0050_deduplication.py b/examples/joiners_and_utils/0050_deduplication.py
similarity index 100%
rename from examples/0050_deduplication.py
rename to examples/joiners_and_utils/0050_deduplication.py
diff --git a/examples/0090_apply_to_cols.py b/examples/joiners_and_utils/0090_apply_to_cols.py
similarity index 100%
rename from examples/0090_apply_to_cols.py
rename to examples/joiners_and_utils/0090_apply_to_cols.py
diff --git a/examples/0040_fuzzy_joining.py b/examples/joiners_and_utils/joiners/0040_fuzzy_joining.py
similarity index 100%
rename from examples/0040_fuzzy_joining.py
rename to examples/joiners_and_utils/joiners/0040_fuzzy_joining.py
diff --git a/examples/0060_multiple_key_join.py b/examples/joiners_and_utils/joiners/0060_multiple_key_join.py
similarity index 100%
rename from examples/0060_multiple_key_join.py
rename to examples/joiners_and_utils/joiners/0060_multiple_key_join.py
diff --git a/examples/0070_join_aggregation.py b/examples/joiners_and_utils/joiners/0070_join_aggregation.py
similarity index 100%
rename from examples/0070_join_aggregation.py
rename to examples/joiners_and_utils/joiners/0070_join_aggregation.py
diff --git a/examples/0080_interpolation_join.py b/examples/joiners_and_utils/joiners/0080_interpolation_join.py
similarity index 100%
rename from examples/0080_interpolation_join.py
rename to examples/joiners_and_utils/joiners/0080_interpolation_join.py
diff --git a/examples/FIXME/08_join_aggregation_full.py b/examples/joiners_and_utils/joiners/08_join_aggregation_full.py
similarity index 100%
rename from examples/FIXME/08_join_aggregation_full.py
rename to examples/joiners_and_utils/joiners/08_join_aggregation_full.py

From f07f82464c60de9cfe3d54b9fe580504c03ae969 Mon Sep 17 00:00:00 2001
From: Maureen <githaigamaureen@gmail.com>
Date: Thu, 28 May 2026 19:22:50 +0300
Subject: [PATCH 2/4] updated changelog

---
 CHANGES.rst | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index d6544a250..c00034661 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -19,9 +19,8 @@ New Features
 
 Changes
 -------
-- An unnecessary warning that was raised when passing a numpy array to the
-  TableVectorizer has been removed. :pr:`1908` by
-  :user:`Sandrine Henry <sandrineh>`.
+- Grouped Examples into three sections. :pr:`2102` by
+  :user:`Maureen Githaiga <maureen-githaiga>`.
 
 Bugfixes
 --------

From 3f0502f02398f882e6bed3532a6e24659d7157b1 Mon Sep 17 00:00:00 2001
From: Riccardo Cappuzzo <riccardo.cappuzzo@gmail.com>
Date: Mon, 15 Jun 2026 13:27:03 +0200
Subject: [PATCH 3/4] reordering examples

---
 CHANGES.rst                                   |   2 +-
 ...apply_to_cols.py => 0010_apply_to_cols.py} |   0
 .../0050_deduplication.py                     |   0
 .../0010_encodings.py                         |   0
 .../0020_text_with_string_encoders.py         |   0
 .../0030_datetime_encoder.py                  |   0
 examples/01_encoding/GALLERY_HEADER.rst       |   2 +
 .../0100_squashing_scaler.py                  |   0
 ...grid_searching_with_the_tablevectorizer.py |   0
 .../1120_multiple_tables.py                   |   0
 .../{data_ops => 02_data_ops}/1130_choices.py |   0
 .../1131_optuna_choices.py                    |   0
 .../1140_subsampling.py                       |   0
 .../1150_use_case.py                          |   0
 .../{data_ops => 02_data_ops}/1160_pytorch.py |   0
 .../GALLERY_HEADER.rst                        |   0
 .../0040_fuzzy_joining.py                     |   0
 .../0060_multiple_key_join.py                 |   0
 .../0070_join_aggregation.py                  |   0
 .../0080_interpolation_join.py                |   0
 examples/03_joining/GALLERY_HEADER.rst        |   2 +
 .../joiners/08_join_aggregation_full.py       | 549 ------------------
 22 files changed, 5 insertions(+), 550 deletions(-)
 rename examples/{joiners_and_utils/0090_apply_to_cols.py => 0010_apply_to_cols.py} (100%)
 rename examples/{joiners_and_utils => }/0050_deduplication.py (100%)
 rename examples/{encoders => 01_encoding}/0010_encodings.py (100%)
 rename examples/{encoders => 01_encoding}/0020_text_with_string_encoders.py (100%)
 rename examples/{encoders => 01_encoding}/0030_datetime_encoder.py (100%)
 create mode 100644 examples/01_encoding/GALLERY_HEADER.rst
 rename examples/{data_ops => 02_data_ops}/0100_squashing_scaler.py (100%)
 rename examples/{data_ops => 02_data_ops}/07_grid_searching_with_the_tablevectorizer.py (100%)
 rename examples/{data_ops => 02_data_ops}/1120_multiple_tables.py (100%)
 rename examples/{data_ops => 02_data_ops}/1130_choices.py (100%)
 rename examples/{data_ops => 02_data_ops}/1131_optuna_choices.py (100%)
 rename examples/{data_ops => 02_data_ops}/1140_subsampling.py (100%)
 rename examples/{data_ops => 02_data_ops}/1150_use_case.py (100%)
 rename examples/{data_ops => 02_data_ops}/1160_pytorch.py (100%)
 rename examples/{data_ops => 02_data_ops}/GALLERY_HEADER.rst (100%)
 rename examples/{joiners_and_utils/joiners => 03_joining}/0040_fuzzy_joining.py (100%)
 rename examples/{joiners_and_utils/joiners => 03_joining}/0060_multiple_key_join.py (100%)
 rename examples/{joiners_and_utils/joiners => 03_joining}/0070_join_aggregation.py (100%)
 rename examples/{joiners_and_utils/joiners => 03_joining}/0080_interpolation_join.py (100%)
 create mode 100644 examples/03_joining/GALLERY_HEADER.rst
 delete mode 100644 examples/joiners_and_utils/joiners/08_join_aggregation_full.py

diff --git a/CHANGES.rst b/CHANGES.rst
index 9fa4f23e8..b6f91820c 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -22,7 +22,7 @@ New Features
 
 Changes
 -------
-- Grouped Examples into three sections. :pr:`690` by
+- Grouped Examples into three sections. :pr:`2102` by
   :user:`Maureen Githaiga <maureen-githaiga>`.
 - An unnecessary warning that was raised when passing a numpy array to the
   TableVectorizer has been removed. :pr:`1908` by
diff --git a/examples/joiners_and_utils/0090_apply_to_cols.py b/examples/0010_apply_to_cols.py
similarity index 100%
rename from examples/joiners_and_utils/0090_apply_to_cols.py
rename to examples/0010_apply_to_cols.py
diff --git a/examples/joiners_and_utils/0050_deduplication.py b/examples/0050_deduplication.py
similarity index 100%
rename from examples/joiners_and_utils/0050_deduplication.py
rename to examples/0050_deduplication.py
diff --git a/examples/encoders/0010_encodings.py b/examples/01_encoding/0010_encodings.py
similarity index 100%
rename from examples/encoders/0010_encodings.py
rename to examples/01_encoding/0010_encodings.py
diff --git a/examples/encoders/0020_text_with_string_encoders.py b/examples/01_encoding/0020_text_with_string_encoders.py
similarity index 100%
rename from examples/encoders/0020_text_with_string_encoders.py
rename to examples/01_encoding/0020_text_with_string_encoders.py
diff --git a/examples/encoders/0030_datetime_encoder.py b/examples/01_encoding/0030_datetime_encoder.py
similarity index 100%
rename from examples/encoders/0030_datetime_encoder.py
rename to examples/01_encoding/0030_datetime_encoder.py
diff --git a/examples/01_encoding/GALLERY_HEADER.rst b/examples/01_encoding/GALLERY_HEADER.rst
new file mode 100644
index 000000000..d79d44f0a
--- /dev/null
+++ b/examples/01_encoding/GALLERY_HEADER.rst
@@ -0,0 +1,2 @@
+Encoding features
+=================
diff --git a/examples/data_ops/0100_squashing_scaler.py b/examples/02_data_ops/0100_squashing_scaler.py
similarity index 100%
rename from examples/data_ops/0100_squashing_scaler.py
rename to examples/02_data_ops/0100_squashing_scaler.py
diff --git a/examples/data_ops/07_grid_searching_with_the_tablevectorizer.py b/examples/02_data_ops/07_grid_searching_with_the_tablevectorizer.py
similarity index 100%
rename from examples/data_ops/07_grid_searching_with_the_tablevectorizer.py
rename to examples/02_data_ops/07_grid_searching_with_the_tablevectorizer.py
diff --git a/examples/data_ops/1120_multiple_tables.py b/examples/02_data_ops/1120_multiple_tables.py
similarity index 100%
rename from examples/data_ops/1120_multiple_tables.py
rename to examples/02_data_ops/1120_multiple_tables.py
diff --git a/examples/data_ops/1130_choices.py b/examples/02_data_ops/1130_choices.py
similarity index 100%
rename from examples/data_ops/1130_choices.py
rename to examples/02_data_ops/1130_choices.py
diff --git a/examples/data_ops/1131_optuna_choices.py b/examples/02_data_ops/1131_optuna_choices.py
similarity index 100%
rename from examples/data_ops/1131_optuna_choices.py
rename to examples/02_data_ops/1131_optuna_choices.py
diff --git a/examples/data_ops/1140_subsampling.py b/examples/02_data_ops/1140_subsampling.py
similarity index 100%
rename from examples/data_ops/1140_subsampling.py
rename to examples/02_data_ops/1140_subsampling.py
diff --git a/examples/data_ops/1150_use_case.py b/examples/02_data_ops/1150_use_case.py
similarity index 100%
rename from examples/data_ops/1150_use_case.py
rename to examples/02_data_ops/1150_use_case.py
diff --git a/examples/data_ops/1160_pytorch.py b/examples/02_data_ops/1160_pytorch.py
similarity index 100%
rename from examples/data_ops/1160_pytorch.py
rename to examples/02_data_ops/1160_pytorch.py
diff --git a/examples/data_ops/GALLERY_HEADER.rst b/examples/02_data_ops/GALLERY_HEADER.rst
similarity index 100%
rename from examples/data_ops/GALLERY_HEADER.rst
rename to examples/02_data_ops/GALLERY_HEADER.rst
diff --git a/examples/joiners_and_utils/joiners/0040_fuzzy_joining.py b/examples/03_joining/0040_fuzzy_joining.py
similarity index 100%
rename from examples/joiners_and_utils/joiners/0040_fuzzy_joining.py
rename to examples/03_joining/0040_fuzzy_joining.py
diff --git a/examples/joiners_and_utils/joiners/0060_multiple_key_join.py b/examples/03_joining/0060_multiple_key_join.py
similarity index 100%
rename from examples/joiners_and_utils/joiners/0060_multiple_key_join.py
rename to examples/03_joining/0060_multiple_key_join.py
diff --git a/examples/joiners_and_utils/joiners/0070_join_aggregation.py b/examples/03_joining/0070_join_aggregation.py
similarity index 100%
rename from examples/joiners_and_utils/joiners/0070_join_aggregation.py
rename to examples/03_joining/0070_join_aggregation.py
diff --git a/examples/joiners_and_utils/joiners/0080_interpolation_join.py b/examples/03_joining/0080_interpolation_join.py
similarity index 100%
rename from examples/joiners_and_utils/joiners/0080_interpolation_join.py
rename to examples/03_joining/0080_interpolation_join.py
diff --git a/examples/03_joining/GALLERY_HEADER.rst b/examples/03_joining/GALLERY_HEADER.rst
new file mode 100644
index 000000000..3b3de2857
--- /dev/null
+++ b/examples/03_joining/GALLERY_HEADER.rst
@@ -0,0 +1,2 @@
+Joining tables with imperfect data
+==================================
diff --git a/examples/joiners_and_utils/joiners/08_join_aggregation_full.py b/examples/joiners_and_utils/joiners/08_join_aggregation_full.py
deleted file mode 100644
index 391ad6e31..000000000
--- a/examples/joiners_and_utils/joiners/08_join_aggregation_full.py
+++ /dev/null
@@ -1,549 +0,0 @@
-"""
-AggJoiner on a credit fraud dataset
-===================================
-
-In this example, we are tackling a fraudulent loan detection use case.
-Because fraud is rare, this dataset is extremely imbalanced, with a prevalence of around
-1.4%.
-
-Instead of focusing on arbitrary metrics like accuracy, we will derive a cost function
-based on (questionable) assumptions about the data. In a real-world scenario, we would
-need to consult with a domain expert within the company to develop a realistic utility
-function.
-
-The data consists of two distinct concepts: a "basket," which can be tagged as fraud (1)
-or not (0), and a list of "products." Each product has several attributes:
-
-- a category (marked by the column ``"item"``),
-- a model (``"model"``),
-- a brand (``"make"``),
-- a merchant code (``"goods_code"``),
-- a price per unit (``"cash_price"``),
-- a quantity selected in the basket (``"Nbr_of_prod_purchas"``)
-
-Since the number of products in each basket varies, the creators of this dataset have
-chosen to join all products and their attributes with their respective basket. They have
-arbitrarily decided to cut off the basket at the 24th product. However, since most
-baskets contain only one or two products, a large proportion of the columns are empty.
-Therefore, the dataset is very sparse, which is challenging from a machine learning
-perspective and also inefficient in terms of memory usage.
-
-.. |AggJoiner| replace::
-     :class:`~skrub.AggJoiner`
-
-.. |Joiner| replace::
-     :class:`~skrub.Joiner`
-
-.. |TableVectorizer| replace::
-     :class:`~skrub.TableVectorizer`
-
-.. |MinHashEncoder| replace::
-     :class:`~skrub.MinHashEncoder`
-
-.. |TargetEncoder| replace::
-     :class:`~sklearn.preprocessing.TargetEncoder`
-
-.. |make_pipeline| replace::
-     :func:`~sklearn.pipeline.make_pipeline`
-
-.. |Pipeline| replace::
-     :class:`~sklearn.pipeline.Pipeline`
-
-.. |HGBC| replace::
-     :class:`~sklearn.ensemble.HistGradientBoostingClassifier`
-
-.. |TunedThresholdClassifierCV| replace::
-     :class:`~sklearn.model_selection.TunedThresholdClassifierCV`
-
-.. |CalibrationDisplay| replace::
-     :class:`~sklearn.calibration.CalibrationDisplay`
-
-.. |pandas.melt| replace::
-     :func:`~pandas.melt`
-
-"""
-
-# %%
-# The data
-# --------
-#
-# We begin with loading the table from figshare. It has around 100k rows.
-from skrub.datasets import fetch_figshare
-
-X = fetch_figshare("48931237").X
-
-# %%
-# The total price is the sum of the price per unit of each product in the basket,
-# multiplied by their quantity. This will also allow us to define a utility function
-# later, in addition of being a useful feature for the learner.
-import numpy as np
-import pandas as pd
-
-from skrub import TableReport
-
-
-def total_price(X):
-    total_price = pd.Series(np.zeros(X.shape[0]), index=X.index, name="total_price")
-    max_item = 24
-    for idx in range(1, max_item + 1):
-        total_price += X[f"cash_price{idx}"].fillna(0) * X[
-            f"Nbr_of_prod_purchas{idx}"
-        ].fillna(0)
-
-    return total_price
-
-
-X["total_price"] = total_price(X)
-TableReport(X)
-
-# %%
-# Metrics
-# -------
-#
-# To consider the problem from a business perspective, we define our utility function
-# by the cost matrix in the function ``credit_gain_score``. False positive and false
-# negative predictions incur a negative gain.
-#
-# Ultimately, we want to maximize this metric. To do so, we can train our learner to
-# minimize a proper scoring rule like the log loss.
-import sklearn
-from sklearn.metrics import log_loss, make_scorer
-
-
-def credit_gain_score(y_true, y_pred, amount):
-    """Define our utility function.
-
-    These numbers are entirely made-up, don't try this at home!
-    """
-    mask_tn = (y_true == 0) & (y_pred == 0)
-    mask_fp = (y_true == 0) & (y_pred == 1)
-    mask_fn = (y_true == 1) & (y_pred == 0)
-
-    # Refusing a fraud yields 0 €
-    fraudulent_refuse = 0
-
-    # Accepting a fraud costs its whole amount
-    fraudulent_accept = -amount[mask_fn].sum()
-
-    # Refusing a legitimate basket transactions cost 5 €
-    legitimate_refuse = mask_fp.sum() * -5
-
-    # Accepting a legitimate basket transaction yields 7% of its amount
-    legitimate_accept = (amount[mask_tn] * 0.07).sum()
-
-    return fraudulent_refuse + fraudulent_accept + legitimate_refuse + legitimate_accept
-
-
-def get_results(model, X_test, y_test, threshold, amount, time_to_fit):
-    y_proba = model.predict_proba(X_test)[:, 1]
-    return {
-        "log_loss": log_loss(y_test, y_proba),
-        "gain_score": credit_gain_score(y_test, y_proba > threshold, amount),
-        "y_proba": y_proba,
-        "y_test": y_test,
-        "time_to_fit": time_to_fit,
-    }
-
-
-sklearn.set_config(enable_metadata_routing=True)
-gain_score = make_scorer(credit_gain_score).set_score_request(amount=True)
-
-results = dict()
-
-# %%
-# Dummy model
-# -----------
-#
-# We first evaluate the performance of a dummy model that always predict the negative
-# class (i.e. all transactions are legit).
-# This is a good sanity check to make sure our model actually learns something useful.
-from time import time
-
-from sklearn.dummy import DummyClassifier
-from sklearn.model_selection import train_test_split
-
-target_col = "fraud_flag"
-X_ = X.drop(columns=[target_col])
-y_ = X[target_col]
-
-X_train, X_test, y_train, y_test = train_test_split(
-    X_,
-    y_,
-    test_size=0.1,
-    stratify=y_,
-    random_state=0,
-)
-
-tic = time()
-dummy_negative = DummyClassifier(strategy="constant", constant=0).fit(X_train, y_train)
-time_to_fit = time() - tic
-
-results["Dummy Negative"] = get_results(
-    dummy_negative,
-    X_test,
-    y_test,
-    threshold=0.5,
-    amount=X_test["total_price"],
-    time_to_fit=time_to_fit,
-)
-
-# %%
-# Low effort estimator
-# --------------------
-#
-# Next, we use the |TableVectorizer| and an |HGBC| to create a very simple
-# baseline model that uses the sparse dataset directly.
-# Note that due to the large number of high
-# cardinality columns, we can't use a multi-dimensional encoder like the
-# |MinHashEncoder|, because the number of columns would then explode.
-#
-# Instead, we encode our categories with a |TargetEncoder|.
-#
-# We also further split the training set into a training and validation set for
-# post-training tuning in the post-training phase below.
-from sklearn.ensemble import HistGradientBoostingClassifier
-from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import TargetEncoder
-
-from skrub import TableVectorizer
-
-X_train_, X_val, y_train_, y_val = train_test_split(
-    X_train, y_train, test_size=0.1, stratify=y_train, random_state=0
-)
-
-low_effort = make_pipeline(
-    TableVectorizer(
-        high_cardinality=TargetEncoder(),
-    ),
-    HistGradientBoostingClassifier(),
-)
-
-tic = time()
-low_effort.fit(X_train_, y_train_)
-time_to_fit = time() - tic
-
-# %%
-# To maximise our utility function, we have to find the best classification threshold to
-# replace the default at 0.5. |TunedThresholdClassifierCV| is a scikit-learn
-# meta-estimator that is designed for this exact purpose.
-# More details in this `example from scikit-learn <https://scikit-learn.org/stable/auto_examples/model_selection/plot_cost_sensitive_learning.html#sphx-glr-auto-examples-model-selection-plot-cost-sensitive-learning-py>`_.
-#
-# We give it our trained model, and fit it on the validation dataset instead of the
-# training dataset to avoid overfitting. Notice that the scoring method is the utility
-# function, to which we pass the amount in ``fit``
-from sklearn.model_selection import TunedThresholdClassifierCV
-
-low_effort_tuned = TunedThresholdClassifierCV(
-    low_effort, cv="prefit", scoring=gain_score, refit=False
-).fit(X_val, y_val, amount=X_val["total_price"])
-
-results["Low effort"] = get_results(
-    low_effort,
-    X_test,
-    y_test,
-    threshold=low_effort_tuned.best_threshold_,
-    amount=X_test["total_price"],
-    time_to_fit=time_to_fit,
-)
-
-# %%
-# We define some plotting functions to display our results.
-import seaborn as sns
-from matplotlib import pyplot as plt
-from sklearn.calibration import CalibrationDisplay
-
-
-def plot_gain_tradeoff(results):
-    """Scatter plot of the score gain (y) vs the fit time (x) for each model."""
-
-    rows = []
-    for estimator_name, result in results.items():
-        result["estimator_name"] = estimator_name
-        rows.append(result)
-    df = pd.DataFrame(rows)
-
-    names = df["estimator_name"].values
-    palette = dict(zip(names, sns.color_palette("colorblind", n_colors=len(names))))
-
-    fig, ax = plt.subplots(figsize=(5, 4), dpi=100)
-    sns.scatterplot(
-        df,
-        x="time_to_fit",
-        y="gain_score",
-        hue="estimator_name",
-        style="estimator_name",
-        ax=ax,
-        palette=palette,
-        s=200,
-    )
-    ax.grid()
-
-    ticks = df["time_to_fit"].round(3).tolist()
-    labels = [f"{tick}s" for tick in ticks]
-    ax.set_xticks(ticks, labels)
-
-    ticks = df["gain_score"].round().tolist()
-    ticks.insert(1, 650_000)
-    labels = [f"{tick:,} €" for tick in ticks]
-
-    ax.set_yticks(ticks, labels)
-    ax.set_ylabel("Gain score")
-    ax.set_xlabel("Time to fit")
-    ax.set_title("Gain score vs Time to fit")
-    plt.tight_layout()
-
-
-def plot_calibration_curve(results):
-    """Plot a calibration curve and the log-loss."""
-
-    estimator_names = list(results)
-    palette = dict(
-        zip(
-            estimator_names,
-            sns.color_palette("colorblind", n_colors=len(estimator_names)),
-        )
-    )
-    fig, ax = plt.subplots(figsize=(6, 4), dpi=100)
-    for name, result in results.items():
-        log_loss = str(round(result["log_loss"], 4))
-        label = f"{name}, {'log_loss: ' + log_loss}"
-        CalibrationDisplay.from_predictions(
-            y_true=result["y_test"],
-            y_prob=result["y_proba"],
-            strategy="quantile",
-            label=label,
-            ax=ax,
-            color=palette[name],
-            n_bins=15,
-        )
-    ax.set_xlim([-0.001, 0.13])
-    ax.set_ylim([-0.001, 0.13])
-    ax.set_title("Calibration curve")
-
-
-# %%
-# We see below that the low effort classifier significantly improves our gains compared
-# to the dummy baseline. The former is of course slower to train than the latter.
-
-plot_gain_tradeoff(results)
-
-
-# %%
-# We also evaluate the calibration of both models. As very few classes are
-# positive, we can expect all probabilities to be close to 0. We have to
-# zoom on it, and use the "quantile" strategy of |CalibrationDisplay| in order to create
-# bins containing an equal number of samples.
-
-plot_calibration_curve(results)
-
-
-# %%
-# Agg-Joiner based estimator
-# --------------------------
-#
-# We first need to split the dataframe between a dataframe representing baskets and a
-# dataframe representing products. In other words, we need to revert the join operation
-# performed by the creator of this dataset. Conceptually, this is close to a
-# |pandas.melt| operation
-#
-# Note that we don't keep the product ordering information, which is probably not an
-# important feature here.
-
-
-def get_columns_at(idx, cols_2_idx):
-    """Small helper that give the position of each of the columns of the idx-th \
-        product."""
-    cols = [
-        "ID",
-        target_col,
-        f"item{idx}",
-        f"cash_price{idx}",
-        f"make{idx}",
-        f"model{idx}",
-        f"goods_code{idx}",
-        f"Nbr_of_prod_purchas{idx}",
-    ]
-    return [cols_2_idx[col] for col in cols]
-
-
-def melt_multi_columns(X):
-    """Create a dataframe where each product is a row."""
-    products = []
-    cols_2_idx = dict(zip(X.columns, range(X.shape[1])))
-    for row in X.values:
-        n_products = min(row[cols_2_idx["Nb_of_items"]], 24)
-        for idx in range(1, n_products + 1):
-            cols = get_columns_at(idx, cols_2_idx)
-            products.append(row[cols])
-
-    cols = [
-        "ID",
-        target_col,
-        "item",
-        "cash_price",
-        "make",
-        "model",
-        "goods_code",
-        "Nbr_of_prod_purchas",
-    ]
-
-    products = pd.DataFrame(products, columns=cols)
-
-    for col in ["make", "model"]:
-        products[col] = products[col].fillna("None")
-
-    return products
-
-
-X_train_[target_col] = y_train_
-X_val[target_col] = y_val
-X_test[target_col] = y_test
-
-baskets_train = X_train_[["ID", "total_price", target_col]]
-baskets_val = X_val[["ID", "total_price", target_col]]
-baskets_test = X_test[["ID", "total_price", target_col]]
-
-products = melt_multi_columns(X)
-
-TableReport(products)
-
-# %%
-# We have to aggregate the products dataframe before joining it back to the basket
-# dataframe. Prior to that, we need to apply some preprocessing to deal with
-# the high cardinality columns. Since these columns have some morphological variations
-# and typos, we use the |MinHashEncoder|.
-#
-# ``goods_code`` is slightly different, as it represents some merchant IDs, which
-# co-occurs for different products. Therefore, we encode it with a |TargetEncoder| as
-# we previously did.
-#
-# To later perform the joiner operation, we must keep the basket ``ID`` with
-# ``"passthrough"``.
-from skrub import MinHashEncoder
-
-
-def get_X_y(data):
-    return data.drop(columns=[target_col]), data[target_col]
-
-
-tic = time()
-vectorizer = TableVectorizer(
-    high_cardinality=MinHashEncoder(),  # applied on ["item", "model", "make"]
-    specific_transformers=[
-        (TargetEncoder(), ["goods_code"]),
-        ("passthrough", ["ID"]),
-    ],
-)
-
-products_transformed = vectorizer.fit_transform(*get_X_y(products))
-time_to_fit = time() - tic
-
-TableReport(products_transformed)
-
-# %%
-# Let's now detail how to leverage |AggJoiner| here. We have just encoded each product
-# attributes, and now we need to somehow aggregate these product encodings into their
-# respective baskets.
-#
-# By aggregating instead of concatenating, we obtain an invariant number of columns,
-# and we remove the sparsity of the dataset.
-#
-# But which aggregation operation should we choose? Since the |MinHashEncoder| hashes
-# ngrams with different hashing functions and return their minimum, it makes sense to
-# aggregate different product encodings using their **minimum** for each dimension.
-# You can view MinHash minimums as activations.
-#
-# For numeric columns and columns encoded with the |TargetEncoder|, we take the mean,
-# standard deviation, minimum and maximum to extract a representative summary of each
-# distribution.
-#
-# We can apply these two sets of operations by chaining together two |AggJoiner| in
-# a |Pipeline| using |make_pipeline|. We also make use of skrub selectors to select
-# columns with the ``glob`` syntax.
-#
-# We need to pass the product dataframe as an auxiliary table argument to AggJoiner
-# in ``__init__``. The basket dataframe is our main table, and we pass it during
-# ``fit``. We discuss the limitations of this design in the conclusion at the bottom
-# of this notebook.
-#
-# Let's display the output of this preprocessing pipeline.
-
-from sklearn.pipeline import make_pipeline
-
-from skrub import AggJoiner
-from skrub import selectors as s
-
-minhash_cols = "ID" | s.glob("item_*") | s.glob("model_*") | s.glob("make_*")
-single_cols = ["ID", "goods_code", "Nbr_of_prod_purchas", "cash_price"]
-
-pipe_agg_joiner = make_pipeline(
-    AggJoiner(
-        aux_table=s.select(products_transformed, minhash_cols),
-        key="ID",
-        operations=["min"],
-    ),
-    AggJoiner(
-        aux_table=s.select(products_transformed, single_cols),
-        key="ID",
-        operations=["mean", "sum", "std", "min", "max"],
-    ),
-)
-basket_train_transformed = pipe_agg_joiner.fit_transform(baskets_train)
-
-TableReport(basket_train_transformed)
-
-# %%
-# Now that we get a sense of how the |AggJoiner| can help us, we complete this pipeline
-# with an |HGBC| and evaluate our final model.
-
-tic = time()
-agg_join_estimator = make_pipeline(
-    pipe_agg_joiner,
-    HistGradientBoostingClassifier(),
-).fit(*get_X_y(baskets_train))
-time_to_fit += time() - tic
-
-agg_join_tuned = TunedThresholdClassifierCV(
-    agg_join_estimator, cv="prefit", scoring=gain_score, refit=False
-).fit(*get_X_y(baskets_val), amount=baskets_val["total_price"])
-
-results["Agg Joiner"] = get_results(
-    agg_join_tuned,
-    *get_X_y(baskets_test),
-    threshold=agg_join_tuned.best_threshold_,
-    amount=baskets_test["total_price"],
-    time_to_fit=time_to_fit,
-)
-# %%
-# Not only did we improve the gains, but this operation is also much faster than the
-# naive low effort!
-
-plot_gain_tradeoff(results)
-
-# %%
-# We see that the agg-joiner model is slightly more calibrated, with a lower (better)
-# log loss.
-
-plot_calibration_curve(results)
-
-# %%
-# Conclusion
-# ----------
-#
-# Many problems involve tables where IDs have a one-to-many relationship. To simplify
-# aggregate-then-join operations for machine learning, we can include the |AggJoiner|
-# in our pipeline.
-#
-# One known limitation of both the |AggJoiner| and |Joiner| is that the auxiliary data
-# to join is passed during the ``__init__`` method instead of the ``fit`` method, and
-# is therefore fixed once the model has been trained.
-# This limitation causes two main issues:
-#
-# 1. **Inefficient model serialization:** Since the dataset has to be pickled along with
-# the model, it can result in a massive file size on disk.
-#
-# 2. **Inflexibility with new, unseen data in a production environment:** To use new
-# auxiliary data, you would need to replace the auxiliary table in the AggJoiner that
-# was used during ``fit`` with the updated data, which is a rather hacky approach.
-#
-# These limitations will be addressed later in skrub.

From f1749a4fa55fc8bb007298a7b41ba4514f428d95 Mon Sep 17 00:00:00 2001
From: Riccardo Cappuzzo <riccardo.cappuzzo@gmail.com>
Date: Mon, 15 Jun 2026 13:29:09 +0200
Subject: [PATCH 4/4] reword examples changelog entry and rewrap long line

---
 CHANGES.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index fbf9d0f00..a7dd2caf2 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -43,10 +43,10 @@ New Features
 
 Changes
 -------
-- Grouped Examples into three sections. :pr:`2102` by
+- Grouped Examples into subject-specific sections. :pr:`2102` by
   :user:`Maureen Githaiga <maureen-githaiga>`.
-- :meth:`choose_from` now transparently converts `outcomes` to a list when it is another type of sequence. :pr:`2100` by
-  :user:`aidbar <aidbar>`.
+- :meth:`choose_from` now transparently converts `outcomes` to a list when it is
+  another type of sequence. :pr:`2100` by :user:`aidbar <aidbar>`.
 - An unnecessary warning that was raised when passing a numpy array to the
   TableVectorizer has been removed. :pr:`1908` by
   :user:`Sandrine Henry <sandrineh>`.