From f7fdcd79186841da7fe8bc4d8a8110ef0af151cb Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 24 Feb 2026 17:45:06 +0100 Subject: [PATCH 01/74] Adding the SessionEncoder --- skrub/_session_encoder.py | 140 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 skrub/_session_encoder.py diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py new file mode 100644 index 000000000..402c4dbad --- /dev/null +++ b/skrub/_session_encoder.py @@ -0,0 +1,140 @@ +""" +The SessionEncoder is a transformer that takes as input: +- a "by" column, which identifies a user +- a "timestamp" column, which identifies the time of an event +- a "session_duration" value, which identifies the duration of a session + +It returns a dataframe with the same number of rows as the input, but with the following +columns: +- "session_id": a unique identifier for each session, which is a combination of the "by" +column and a session number +- "session_start": the timestamp of the first event in the session +- "session_end": the timestamp of the last event in the session +- "session_duration": the duration of the session, which is the difference between the +last and first timestamps in the session +""" + +from sklearn.base import BaseEstimator, TransformerMixin + +from . import _dataframe as sbd + + +class SessionEncoder(TransformerMixin, BaseEstimator): + """Encode sessions from a dataframe. + + Parameters + ---------- + by : str + The name of the column that identifies a user. This column is used to + group events into sessions. + + timestamp : str + The name of the column that identifies the time of an event. This column + is used to determine the start and end of a session. + + session_duration : str, optional + The name of the column that identifies the duration of a session. If not + provided, the duration is calculated as the difference between the last + and first timestamps in the session. 
+ + session_gap : int, default=30 + The maximum gap (in minutes) between events in a session. If the gap + between two events exceeds this value, they are considered to be in + different sessions. + + Attributes + ---------- + all_inputs_ : list of str + All column names in the input dataframe. + """ + + def __init__(self, by, timestamp, session_duration=None, session_gap=30): + self.by = by + self.timestamp = timestamp + self.session_duration = session_duration + self.session_gap = session_gap + + def fit(self, X, y=None): + """Fit the transformer to the data. + + Parameters + ---------- + X : pandas.DataFrame or polars.DataFrame + The input dataframe. + + y : None + Ignored. + + Returns + ------- + self : SessionEncoder + The fitted transformer. + """ + self.fit_transform(X, y) + return self + + def fit_transform(self, X, y=None): + """Fit the transformer to the data and return the transformed dataframe. + + Parameters + ---------- + X : pandas.DataFrame or polars.DataFrame + The input dataframe. + + y : None + Ignored. + + Returns + ------- + pandas.DataFrame or polars.DataFrame + The transformed dataframe with session information. 
+ """ + self.all_inputs_ = sbd.column_names(X) + # check that the required columns are present in the input dataframe + if self.by not in self.all_inputs_: + raise ValueError(f"Column '{self.by}' not found in input dataframe") + if self.timestamp not in self.all_inputs_: + raise ValueError(f"Column '{self.timestamp}' not found in input dataframe") + + # check the correctness of the values of session_gap and session_duration + if not isinstance(self.session_gap, (int, float)) or self.session_gap <= 0: + raise ValueError("session_gap must be a positive number") + + if self.session_duration is not None and not isinstance( + self.session_duration, (int, float) + ): + raise ValueError("session_duration must be a number if provided") + + # sort the input dataframe by the "by" and "timestamp" columns + X_sorted = sbd.sort(X, by=[self.by, self.timestamp]) # noqa + # mark the start of a new session by checking the difference + + # add the session id + + # compute statistics + + # wrap everything up in a dataframe and return it + + return self.transform(X) + + def _check_is_new_session(self, X): + """Check if a new session starts at each row of the dataframe. + + Parameters + ---------- + X : pandas.DataFrame or polars.DataFrame + The input dataframe. + + Returns + ------- + pandas.Series or polars.Series + A boolean series indicating whether a new session starts at each row. 
+ """ + # check if the "by" column changes + char_diff = X[self.by].diff().fillna(0) > 0 + # check if the time difference between events exceeds the session gap + time_diff = X[self.timestamp].diff().fillna(0) > self.session_gap * 60 * 1000 + # a new session starts if either the "by" column changes or the time gap is + # exceeded + is_new_session = char_diff | time_diff + return is_new_session From 65be83adbee5785997efc5ca8b5779db7dcf4287 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Wed, 25 Feb 2026 11:51:54 +0100 Subject: [PATCH 02/74] more work --- skrub/__init__.py | 2 + skrub/_session_encoder.py | 113 ++++++++++++++++++++++++++++++-------- 2 files changed, 91 insertions(+), 24 deletions(-) diff --git a/skrub/__init__.py b/skrub/__init__.py index 97be57729..880308217 100644 --- a/skrub/__init__.py +++ b/skrub/__init__.py @@ -40,6 +40,7 @@ from ._multi_agg_joiner import MultiAggJoiner from ._reporting import TableReport, patch_display, unpatch_display from ._select_cols import Drop, DropCols, SelectCols +from ._session_encoder import SessionEncoder from ._similarity_encoder import SimilarityEncoder from ._squashing_scaler import SquashingScaler from ._string_encoder import StringEncoder @@ -107,4 +108,5 @@ "ApplyToSubFrame", "ApplyToCols", "ToFloat", + "SessionEncoder", ] diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 402c4dbad..654285a42 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -14,9 +14,87 @@ last and first timestamps in the session """ +import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin from . 
import _dataframe as sbd +from ._dispatch import dispatch + + +@dispatch +def _check_is_new_session(X, by, timestamp, session_gap): + # Avoid circular import + from ._dispatch import raise_dispatch_unregistered_type + + raise_dispatch_unregistered_type(X, kind="Dataframe") + + +@_check_is_new_session.specialize("pandas") +def _check_is_new_session_pandas(X, by, timestamp, session_gap): + # check if the "by" column changes + char_diff = X[by].diff().fillna(0) > 0 + # check if the time difference between events exceeds the session gap + time_diff = X[timestamp].astype(int).diff().fillna(0) // 10**6 > session_gap * 60 + # a new session starts if either the "by" column changes or the time gap is + # exceeded + is_new_session = char_diff | time_diff + return is_new_session + + +@_check_is_new_session.specialize("polars") +def _check_is_new_session_polars(X, by, timestamp, session_gap): + # check if the "by" column changes + char_diff = X[by].diff().fill_null(0) > 0 + # check if the time difference between events exceeds the session gap + time_diff = X[timestamp].diff().fill_null(0) > session_gap * 60 * 1000 + # a new session starts if either the "by" column changes or the time gap is + # exceeded + is_new_session = char_diff | time_diff + return is_new_session + + +@dispatch +def _factorize_column(X, column_name): + # Avoid circular import + from ._dispatch import raise_dispatch_unregistered_type + + raise_dispatch_unregistered_type(X, kind="Dataframe") + + +@_factorize_column.specialize("pandas") +def _factorize_column_pandas(X, column_name): + codes, _ = pd.factorize(X[column_name]) + return codes + + +@_factorize_column.specialize("polars") +def _factorize_column_polars(X, column_name): + import polars as pl + + # TODO: update this according to the proper polars API + return X[column_name].cast(pl.Categorical).to_physical() + + +@dispatch +def _add_session_id(X, is_new_session): + # Avoid circular import + from ._dispatch import raise_dispatch_unregistered_type + + 
raise_dispatch_unregistered_type(X, kind="Dataframe") + + +@_add_session_id.specialize("pandas") +def _add_session_id_pandas(X, is_new_session): + # Compute cumulative sum of is_new_session to create session IDs + X_copy = X.copy() + X_copy["session_id"] = is_new_session.cumsum() + return X_copy + + +@_add_session_id.specialize("polars") +def _add_session_id_polars(X, is_new_session): + # Add session_id by computing cumulative sum of is_new_session + return X.with_columns(is_new_session.cum_sum().alias("session_id")) class SessionEncoder(TransformerMixin, BaseEstimator): @@ -107,34 +185,21 @@ def fit_transform(self, X, y=None): # sort the input dataframe by the "by" and "timestamp" columns X_sorted = sbd.sort(X, by=[self.by, self.timestamp]) # noqa - # mark the start of a new session by checking the difference + # convert by column to string if it's not already, to ensure + # that the diff operation works correctly + X_factorized = sbd.with_columns( + X_sorted, **{self.by: _factorize_column(X_sorted, self.by)} + ) + # mark the start of a new session by checking the difference + is_new_session = _check_is_new_session( + X_factorized, self.by, self.timestamp, self.session_gap + ) # add the session id + X_with_session_id = _add_session_id(X_factorized, is_new_session) # compute statistics # wrap everything up in a dataframe and return it - return self.transform(X) - - def _check_is_new_session(self, X): - """Check if a new session starts at each row of the dataframe. - - Parameters - ---------- - X : pandas.DataFrame or polars.DataFrame - The input dataframe. - - Returns - ------- - pandas.Series or polars.Series - A boolean series indicating whether a new session starts at each row. 
- """ - # check if the "by" column changes - char_diff = X[self.by].diff().fillna(0) > 0 - # check if the time difference between events exceeds the session gap - time_diff = X[self.timestamp].diff().fillna(0) > self.session_gap * 60 * 1000 - # a new session starts if either the "by" column changes or the time gap is - # exceeded - is_new_session = char_diff | time_diff - return is_new_session + return X_with_session_id From a6caeb7d654ce08aeea064cb168dd996594a0090 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Wed, 25 Feb 2026 11:52:10 +0100 Subject: [PATCH 03/74] adding tests --- skrub/tests/test_session_encoder.py | 279 ++++++++++++++++++++++++++++ 1 file changed, 279 insertions(+) create mode 100644 skrub/tests/test_session_encoder.py diff --git a/skrub/tests/test_session_encoder.py b/skrub/tests/test_session_encoder.py new file mode 100644 index 000000000..abf194ae6 --- /dev/null +++ b/skrub/tests/test_session_encoder.py @@ -0,0 +1,279 @@ +import datetime + +import pytest + +from skrub import SessionEncoder +from skrub import _dataframe as sbd + + +def test_session_encoder_basic(df_module): + """Test basic sessionization with numeric user IDs.""" + # Create sample data with clear sessions + timestamps = [] + user_ids = [] + values = [] + + base_time = datetime.datetime(2024, 1, 1) + + # Create 3 sessions with events close together (2 min apart), + # separated by large gaps (10 days) + for session in range(3): + session_start = base_time + datetime.timedelta(days=session * 10) + for event in range(5): + timestamps.append(session_start + datetime.timedelta(minutes=event * 2)) + user_ids.append(101) + values.append(float(session * 5 + event)) + + df = df_module.make_dataframe( + {"timestamp": timestamps, "user_id": user_ids, "value": values} + ) + + # Apply SessionEncoder with 20-minute gap threshold + se = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=20) + result = se.fit_transform(df) + + # Check that we have 3 sessions + session_ids 
= sbd.to_list(sbd.col(result, "session_id")) + unique_sessions = set(session_ids) + assert len(unique_sessions) == 3, f"Expected 3 sessions, got {len(unique_sessions)}" + + # Check that events within a session have the same session_id + # Each session has 5 events, so we should have patterns like: + # [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2] + assert session_ids[0] == session_ids[4] + assert session_ids[5] == session_ids[9] + assert session_ids[10] == session_ids[14] + + # Check that different sessions have different IDs + assert session_ids[0] != session_ids[5] + assert session_ids[5] != session_ids[10] + + +def test_session_encoder_alphanumeric_users(df_module): + """Test sessionization with alphanumeric user IDs.""" + timestamps = [] + user_ids = [] + + base_time = datetime.datetime(2024, 1, 1) + + # User A: 2 sessions + for session in range(2): + session_start = base_time + datetime.timedelta(hours=session * 2) + for event in range(3): + timestamps.append(session_start + datetime.timedelta(minutes=event * 5)) + user_ids.append("USER_A") + + # User B: 1 session + session_start = base_time + datetime.timedelta(days=1) + for event in range(3): + timestamps.append(session_start + datetime.timedelta(minutes=event * 5)) + user_ids.append("USER_B") + + df = df_module.make_dataframe( + { + "timestamp": timestamps, + "user_id": user_ids, + } + ) + + se = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=30) + result = se.fit_transform(df) + + session_ids = sbd.to_list(sbd.col(result, "session_id")) + + # Check User A has 2 sessions + # Sessions should change when user changes or time gap exceeds threshold + # First 3 events: USER_A session 1 + # Next 3 events: USER_A session 2 (2 hours gap > 30 min) + # Last 3 events: USER_B session 3 (user change) + assert len(set(session_ids)) == 3 + + # Check that user change triggers new session + user_a_sessions = set([session_ids[i] for i in range(6)]) + user_b_sessions = set([session_ids[i] for i in range(6, 
9)]) + assert len(user_a_sessions.intersection(user_b_sessions)) == 0 + + +def test_session_encoder_multiple_users(df_module): + """Test sessionization with multiple users interleaved.""" + timestamps = [] + user_ids = [] + + base_time = datetime.datetime(2024, 1, 1) + + # Create events for two users, alternating + for i in range(10): + timestamps.append(base_time + datetime.timedelta(minutes=i)) + user_ids.append(101 if i % 2 == 0 else 102) + + df = df_module.make_dataframe( + { + "timestamp": timestamps, + "user_id": user_ids, + } + ) + + se = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=30) + result = se.fit_transform(df) + + # After sorting by user_id and timestamp, each user should have 1 session + # since all their events are within 30 minutes + session_ids = sbd.to_list(sbd.col(result, "session_id")) + + # The encoder sorts by user_id then timestamp, so events are grouped by user + # Check that there are exactly 2 sessions (one per user) + assert len(set(session_ids)) == 2 + + +def test_session_encoder_time_gap_threshold(df_module): + """Test that session_gap parameter correctly determines sessionization.""" + timestamps = [ + datetime.datetime(2024, 1, 1, 10, 0), + datetime.datetime(2024, 1, 1, 10, 15), # 15 min gap + datetime.datetime(2024, 1, 1, 10, 50), # 35 min gap + datetime.datetime(2024, 1, 1, 11, 0), # 10 min gap + ] + user_ids = [101, 101, 101, 101] + + df = df_module.make_dataframe( + { + "timestamp": timestamps, + "user_id": user_ids, + } + ) + + # With 20-minute gap: should create 2 sessions (split at 35-min gap) + se_20 = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=20) + result_20 = se_20.fit_transform(df) + session_ids_20 = sbd.to_list(sbd.col(result_20, "session_id")) + assert len(set(session_ids_20)) == 2 + + # With 40-minute gap: should create 1 session (all gaps < 40 min) + se_40 = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=40) + result_40 = se_40.fit_transform(df) + 
session_ids_40 = sbd.to_list(sbd.col(result_40, "session_id")) + assert len(set(session_ids_40)) == 1 + + +def test_session_encoder_single_event(df_module): + """Test sessionization with single event per user.""" + df = df_module.make_dataframe( + { + "timestamp": [datetime.datetime(2024, 1, 1, 10, 0)], + "user_id": [101], + } + ) + + se = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=30) + result = se.fit_transform(df) + + session_ids = sbd.to_list(sbd.col(result, "session_id")) + assert len(session_ids) == 1 + # Single event should create one session + assert session_ids[0] in [0, 1] # Could be 0 or 1 depending on implementation + + +def test_session_encoder_empty_dataframe(df_module): + """Test sessionization with empty dataframe.""" + df = df_module.make_dataframe( + { + "timestamp": [], + "user_id": [], + } + ) + + se = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=30) + result = se.fit_transform(df) + + assert sbd.shape(result)[0] == 0 + assert "session_id" in sbd.column_names(result) + + +def test_session_encoder_missing_column_error(df_module): + """Test that missing columns raise appropriate errors.""" + df = df_module.make_dataframe( + { + "timestamp": [datetime.datetime(2024, 1, 1)], + "wrong_column": [101], + } + ) + + se = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=30) + with pytest.raises(ValueError, match="Column 'user_id' not found"): + se.fit_transform(df) + + df2 = df_module.make_dataframe( + { + "wrong_timestamp": [datetime.datetime(2024, 1, 1)], + "user_id": [101], + } + ) + + se2 = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=30) + with pytest.raises(ValueError, match="Column 'timestamp' not found"): + se2.fit_transform(df2) + + +def test_session_encoder_invalid_parameters(df_module): + """Test that invalid parameters raise appropriate errors.""" + df = df_module.make_dataframe( + { + "timestamp": [datetime.datetime(2024, 1, 1)], + "user_id": [101], + } + ) + + # Test 
negative session_gap + se_negative = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=-10) + with pytest.raises(ValueError, match="session_gap must be a positive number"): + se_negative.fit_transform(df) + + # Test zero session_gap + se_zero = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=0) + with pytest.raises(ValueError, match="session_gap must be a positive number"): + se_zero.fit_transform(df) + + +def test_session_encoder_preserves_columns(df_module): + """Test that original columns are preserved in output.""" + df = df_module.make_dataframe( + { + "timestamp": [ + datetime.datetime(2024, 1, 1, 10, 0), + datetime.datetime(2024, 1, 1, 10, 5), + ], + "user_id": [101, 101], + "extra_col": [1.5, 2.5], + } + ) + + se = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=30) + result = se.fit_transform(df) + + result_cols = sbd.column_names(result) + assert "timestamp" in result_cols + assert "user_id" in result_cols + assert "extra_col" in result_cols + assert "session_id" in result_cols + + +def test_session_encoder_fit_and_transform(df_module): + """Test that fit() and transform() work separately.""" + df = df_module.make_dataframe( + { + "timestamp": [ + datetime.datetime(2024, 1, 1, 10, 0), + datetime.datetime(2024, 1, 1, 10, 5), + ], + "user_id": [101, 101], + } + ) + + se = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=30) + + # Test fit returns self + se_fitted = se.fit(df) + assert se_fitted is se + + # Test that all_inputs_ is set after fit + assert hasattr(se, "all_inputs_") From cc14c5521d071a1440b7cc6a4b07c3d106adddcd Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Wed, 25 Feb 2026 11:56:44 +0100 Subject: [PATCH 04/74] changelog --- CHANGES.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 5ec7c4122..cfe3fbcd2 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -19,6 +19,10 @@ New Features faster (the overhead it removes typically becomes noticeable 
only in DataOps with 50-100 nodes or more). Moreover, the evaluation of large DataOps has also become faster. :pr:`1890` by :user:`Jérôme Dockès `. +- The :class:`SessionEncoder` is now available. This encoder takes a dataframe with + a timestamp column and a grouping column (e.g., user ID) and computes sessions + for each value in the grouping column. Additional statistics can also be added. + :pr:`1930` by :user:`Riccardo Cappuzzo `. Changes ------- From dba476f7d2238840b54648f00e35630083212532 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Wed, 25 Feb 2026 14:59:01 +0100 Subject: [PATCH 05/74] adding drop cols, various improvements --- skrub/_dataframe/_common.py | 16 +++ skrub/_session_encoder.py | 29 ++++-- skrub/tests/test_session_encoder.py | 146 ++++++++++++++++------------ 3 files changed, 121 insertions(+), 70 deletions(-) diff --git a/skrub/_dataframe/_common.py b/skrub/_dataframe/_common.py index 05f7a3c61..495fdb640 100644 --- a/skrub/_dataframe/_common.py +++ b/skrub/_dataframe/_common.py @@ -54,6 +54,7 @@ "reset_index", "copy_index", "index", + "drop_columns", # # Inspecting dtypes and casting # @@ -632,6 +633,21 @@ def _index_pandas(obj): return obj.index +@dispatch +def drop_columns(df, columns): + raise_dispatch_unregistered_type(df, kind="DataFrame") + + +@drop_columns.specialize("pandas", argument_type="DataFrame") +def _drop_columns_pandas(df, columns): + return df.drop(columns=columns) + + +@drop_columns.specialize("polars", argument_type="DataFrame") +def _drop_columns_polars(df, columns): + return df.drop(columns) + + # # Inspecting dtypes and casting # ============================= diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 654285a42..0cb8cf002 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -34,7 +34,9 @@ def _check_is_new_session_pandas(X, by, timestamp, session_gap): # check if the "by" column changes char_diff = X[by].diff().fillna(0) > 0 # check if the time difference 
between events exceeds the session gap - time_diff = X[timestamp].astype(int).diff().fillna(0) // 10**6 > session_gap * 60 + time_diff = ( + X[timestamp].astype(int).diff().fillna(0) // 10**3 > session_gap * 60 * 1000 + ) # a new session starts if either the "by" column changes or the time gap is # exceeded is_new_session = char_diff | time_diff @@ -46,7 +48,9 @@ def _check_is_new_session_polars(X, by, timestamp, session_gap): # check if the "by" column changes char_diff = X[by].diff().fill_null(0) > 0 # check if the time difference between events exceeds the session gap - time_diff = X[timestamp].diff().fill_null(0) > session_gap * 60 * 1000 + time_diff = ( + X[timestamp].dt.epoch("ms").diff().fill_null(0) > session_gap * 60 * 1000 + ) # a new session starts if either the "by" column changes or the time gap is # exceeded is_new_session = char_diff | time_diff @@ -188,18 +192,25 @@ def fit_transform(self, X, y=None): # convert by column to string if it's not already, to ensure # that the diff operation works correctly - X_factorized = sbd.with_columns( - X_sorted, **{self.by: _factorize_column(X_sorted, self.by)} - ) + if not sbd.is_numeric(X_sorted[self.by]): + factorized_by = f"{self.by}_factorized" + X_factorized = sbd.with_columns( + X_sorted, **{factorized_by: _factorize_column(X_sorted, self.by)} + ) + else: + factorized_by = self.by + X_factorized = X_sorted # mark the start of a new session by checking the difference is_new_session = _check_is_new_session( - X_factorized, self.by, self.timestamp, self.session_gap + X_factorized, factorized_by, self.timestamp, self.session_gap ) # add the session id X_with_session_id = _add_session_id(X_factorized, is_new_session) - # compute statistics - - # wrap everything up in a dataframe and return it + # drop the factorized "by" column if the original "by" column was not numeric + if factorized_by != self.by: + X_with_session_id = sbd.drop_columns( + X_with_session_id, columns=[factorized_by] + ) return 
X_with_session_id diff --git a/skrub/tests/test_session_encoder.py b/skrub/tests/test_session_encoder.py index abf194ae6..a0ab5922d 100644 --- a/skrub/tests/test_session_encoder.py +++ b/skrub/tests/test_session_encoder.py @@ -6,92 +6,116 @@ from skrub import _dataframe as sbd -def test_session_encoder_basic(df_module): - """Test basic sessionization with numeric user IDs.""" - # Create sample data with clear sessions +@pytest.fixture +def example_session_data(df_module): + """Create example session data with multiple users and sessions.""" timestamps = [] user_ids = [] - values = [] + usernames = [] base_time = datetime.datetime(2024, 1, 1) - # Create 3 sessions with events close together (2 min apart), - # separated by large gaps (10 days) + # User 101, alice: 3 sessions with 5 events each, 10 days apart for session in range(3): session_start = base_time + datetime.timedelta(days=session * 10) for event in range(5): timestamps.append(session_start + datetime.timedelta(minutes=event * 2)) user_ids.append(101) - values.append(float(session * 5 + event)) + usernames.append("alice") - df = df_module.make_dataframe( - {"timestamp": timestamps, "user_id": user_ids, "value": values} - ) - - # Apply SessionEncoder with 20-minute gap threshold - se = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=20) - result = se.fit_transform(df) - - # Check that we have 3 sessions - session_ids = sbd.to_list(sbd.col(result, "session_id")) - unique_sessions = set(session_ids) - assert len(unique_sessions) == 3, f"Expected 3 sessions, got {len(unique_sessions)}" - - # Check that events within a session have the same session_id - # Each session has 5 events, so we should have patterns like: - # [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2] - assert session_ids[0] == session_ids[4] - assert session_ids[5] == session_ids[9] - assert session_ids[10] == session_ids[14] - - # Check that different sessions have different IDs - assert session_ids[0] != session_ids[5] - assert 
session_ids[5] != session_ids[10] - - -def test_session_encoder_alphanumeric_users(df_module): - """Test sessionization with alphanumeric user IDs.""" - timestamps = [] - user_ids = [] - - base_time = datetime.datetime(2024, 1, 1) - - # User A: 2 sessions + # User 102, bob: 2 sessions with 3 events each, 2 hours apart for session in range(2): - session_start = base_time + datetime.timedelta(hours=session * 2) + session_start = base_time + datetime.timedelta(days=35, hours=session * 2) for event in range(3): timestamps.append(session_start + datetime.timedelta(minutes=event * 5)) - user_ids.append("USER_A") + user_ids.append(102) + usernames.append("bob") - # User B: 1 session - session_start = base_time + datetime.timedelta(days=1) - for event in range(3): - timestamps.append(session_start + datetime.timedelta(minutes=event * 5)) - user_ids.append("USER_B") + # User 103, charlie: 1 session with 4 events + session_start = base_time + datetime.timedelta(days=40) + for event in range(4): + timestamps.append(session_start + datetime.timedelta(minutes=event * 3)) + user_ids.append(103) + usernames.append("charlie") - df = df_module.make_dataframe( + return df_module.make_dataframe( { "timestamp": timestamps, "user_id": user_ids, + "username": usernames, } ) - se = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=30) - result = se.fit_transform(df) +@pytest.mark.parametrize( + "by_column,expected_sessions,group_key_to_sessions", + [ + ("user_id", 6, {101: 3, 102: 2, 103: 1}), + ("username", 6, {"alice": 3, "bob": 2, "charlie": 1}), + ], +) +def test_session_encoder_basic( + example_session_data, by_column, expected_sessions, group_key_to_sessions +): + """Test basic sessionization grouping by user_id or username.""" + # Apply SessionEncoder grouping by the specified column + se = SessionEncoder(by=by_column, timestamp="timestamp", session_gap=30) + result = se.fit_transform(example_session_data) + + # Check that we have the expected total number of 
sessions session_ids = sbd.to_list(sbd.col(result, "session_id")) + unique_sessions = set(session_ids) + assert len(unique_sessions) == expected_sessions + + # Get the appropriate column data based on what we're grouping by + if by_column == "user_id": + group_values = sbd.to_list(sbd.col(result, "user_id")) + else: # by_column == "username" + group_values = sbd.to_list(sbd.col(result, "username")) + + counted_sessions = {} + for group_key, session_id in zip(group_values, session_ids): + if group_key not in counted_sessions: + counted_sessions[group_key] = set() + counted_sessions[group_key].add(session_id) + for group_key, sessions in counted_sessions.items(): + assert len(sessions) == group_key_to_sessions[group_key] + + +@pytest.mark.parametrize( + "by_column", + ["user_id", "username"], +) +def test_session_encoder_different_users_different_sessions( + example_session_data, by_column +): + """Test that different users/groups have different session IDs.""" + # Apply SessionEncoder + se = SessionEncoder(by=by_column, timestamp="timestamp", session_gap=30) + result = se.fit_transform(example_session_data) - # Check User A has 2 sessions - # Sessions should change when user changes or time gap exceeds threshold - # First 3 events: USER_A session 1 - # Next 3 events: USER_A session 2 (2 hours gap > 30 min) - # Last 3 events: USER_B session 3 (user change) - assert len(set(session_ids)) == 3 - - # Check that user change triggers new session - user_a_sessions = set([session_ids[i] for i in range(6)]) - user_b_sessions = set([session_ids[i] for i in range(6, 9)]) - assert len(user_a_sessions.intersection(user_b_sessions)) == 0 + session_ids = sbd.to_list(sbd.col(result, "session_id")) + result_user_ids = sbd.to_list(sbd.col(result, "user_id")) + result_usernames = sbd.to_list(sbd.col(result, "username")) + + # Get the appropriate column data based on what we're grouping by + if by_column == "user_id": + group_values = result_user_ids + group_keys = [101, 102, 103] + 
else: # by_column == "username" + group_values = result_usernames + group_keys = ["alice", "bob", "charlie"] + + # Verify different groups don't share session IDs + for i, key1 in enumerate(group_keys): + for key2 in group_keys[i + 1 :]: + indices1 = [idx for idx, v in enumerate(group_values) if v == key1] + indices2 = [idx for idx, v in enumerate(group_values) if v == key2] + sessions1 = set([session_ids[idx] for idx in indices1]) + sessions2 = set([session_ids[idx] for idx in indices2]) + assert len(sessions1.intersection(sessions2)) == 0, ( + f"Groups {key1} and {key2} should not share session IDs" + ) def test_session_encoder_multiple_users(df_module): From d46090db34e79acc1d9e9c0a134cf904fb3a785c Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Wed, 25 Feb 2026 15:05:12 +0100 Subject: [PATCH 06/74] adding a test --- skrub/_dataframe/tests/test_common.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/skrub/_dataframe/tests/test_common.py b/skrub/_dataframe/tests/test_common.py index 5ee0707d2..f06fd382c 100644 --- a/skrub/_dataframe/tests/test_common.py +++ b/skrub/_dataframe/tests/test_common.py @@ -447,6 +447,15 @@ def test_index(df_module): assert ns.index(col) is None +def test_drop_columns(df_module): + df = df_module.example_dataframe + col_names = ns.column_names(df) + col_to_drop = col_names[0] + df_dropped = ns.drop_columns(df, [col_to_drop]) + assert col_to_drop not in ns.column_names(df_dropped) + assert len(ns.column_names(df_dropped)) == len(col_names) - 1 + + # # Inspecting dtypes and casting # ============================= From 28b958ab8dd6b3c25aae40fb9b9792c9a76103b7 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Wed, 25 Feb 2026 15:41:56 +0100 Subject: [PATCH 07/74] simplifying tests and code --- skrub/_session_encoder.py | 35 ++++++------------- skrub/tests/test_session_encoder.py | 54 ++++++++++++++++------------- 2 files changed, 40 insertions(+), 49 deletions(-) diff --git a/skrub/_session_encoder.py 
b/skrub/_session_encoder.py index 0cb8cf002..22ab0831a 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -2,18 +2,16 @@ The SessionEncoder is a transformer that takes as input: - a "by" column, which identifies a user - a "timestamp" column, which identifies the time of an event -- a "session_duration" value, which identifies the duration of a session - -It returns a dataframe with the same number of rows as the input, but with the following -columns: -- "session_id": a unique identifier for each session, which is a combination of the "by" -column and a session number -- "session_start": the timestamp of the first event in the session -- "session_end": the timestamp of the last event in the session -- "session_duration": the duration of the session, which is the difference between the -last and first timestamps in the session +- a "session_gap" value, which identifies the maximum allowed gap between events +in a session + +It returns a dataframe with the same number of rows as the input, but with the +column "session_id": a unique identifier for each session, which is a combination +of the "by" column and a session number """ +import numbers + import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin @@ -114,11 +112,6 @@ class SessionEncoder(TransformerMixin, BaseEstimator): The name of the column that identifies the time of an event. This column is used to determine the start and end of a session. - session_duration : str, optional - The name of the column that identifies the duration of a session. If not - provided, the duration is calculated as the difference between the last - and first timestamps in the session. - session_gap : int, default=30 The maximum gap (in minutes) between events in a session. If the gap between two events exceeds this value, they are considered to be in @@ -130,10 +123,9 @@ class SessionEncoder(TransformerMixin, BaseEstimator): All column names in the input dataframe. 
""" - def __init__(self, by, timestamp, session_duration=None, session_gap=30): + def __init__(self, by, timestamp, session_gap=30): self.by = by self.timestamp = timestamp - self.session_duration = session_duration self.session_gap = session_gap def fit(self, X, y=None): @@ -178,15 +170,10 @@ def fit_transform(self, X, y=None): if self.timestamp not in self.all_inputs_: raise ValueError(f"Column '{self.timestamp}' not found in input dataframe") - # check the correctness of the values of session_gap and session_duration - if not isinstance(self.session_gap, (int, float)) or self.session_gap <= 0: + # check the correctness of the values of session_gap + if not isinstance(self.session_gap, numbers.Number) or self.session_gap <= 0: raise ValueError("session_gap must be a positive number") - if self.session_duration is not None and not isinstance( - self.session_duration, (int, float) - ): - raise ValueError("session_duration must be a number if provided") - # sort the input dataframe by the "by" and "timestamp" columns X_sorted = sbd.sort(X, by=[self.by, self.timestamp]) # noqa diff --git a/skrub/tests/test_session_encoder.py b/skrub/tests/test_session_encoder.py index a0ab5922d..19e99efad 100644 --- a/skrub/tests/test_session_encoder.py +++ b/skrub/tests/test_session_encoder.py @@ -67,11 +67,10 @@ def test_session_encoder_basic( unique_sessions = set(session_ids) assert len(unique_sessions) == expected_sessions - # Get the appropriate column data based on what we're grouping by - if by_column == "user_id": - group_values = sbd.to_list(sbd.col(result, "user_id")) - else: # by_column == "username" - group_values = sbd.to_list(sbd.col(result, "username")) + # content of the "session_id" column after sessionization + session_ids = sbd.to_list(sbd.col(result, "session_id")) + # content of the "by" column (user_id or username) + group_values = sbd.to_list(sbd.col(result, by_column)) counted_sessions = {} for group_key, session_id in zip(group_values, session_ids): @@ 
-83,39 +82,37 @@ def test_session_encoder_basic( @pytest.mark.parametrize( - "by_column", - ["user_id", "username"], + "by_column,group_keys", + [ + ("user_id", [101, 102, 103]), + ("username", ["alice", "bob", "charlie"]), + ], ) def test_session_encoder_different_users_different_sessions( - example_session_data, by_column + example_session_data, by_column, group_keys ): """Test that different users/groups have different session IDs.""" # Apply SessionEncoder se = SessionEncoder(by=by_column, timestamp="timestamp", session_gap=30) result = se.fit_transform(example_session_data) + # content of the "session_id" column after sessionization session_ids = sbd.to_list(sbd.col(result, "session_id")) - result_user_ids = sbd.to_list(sbd.col(result, "user_id")) - result_usernames = sbd.to_list(sbd.col(result, "username")) - - # Get the appropriate column data based on what we're grouping by - if by_column == "user_id": - group_values = result_user_ids - group_keys = [101, 102, 103] - else: # by_column == "username" - group_values = result_usernames - group_keys = ["alice", "bob", "charlie"] + # content of the "by" column (user_id or username) + group_values = sbd.to_list(sbd.col(result, by_column)) # Verify different groups don't share session IDs for i, key1 in enumerate(group_keys): for key2 in group_keys[i + 1 :]: + # find the indices of events for each group key (user id or username) indices1 = [idx for idx, v in enumerate(group_values) if v == key1] indices2 = [idx for idx, v in enumerate(group_values) if v == key2] - sessions1 = set([session_ids[idx] for idx in indices1]) - sessions2 = set([session_ids[idx] for idx in indices2]) - assert len(sessions1.intersection(sessions2)) == 0, ( - f"Groups {key1} and {key2} should not share session IDs" - ) + # find the unique session IDs for each group key (each user) + sessions1 = {session_ids[idx] for idx in indices1} + sessions2 = {session_ids[idx] for idx in indices2} + + # check that there are no shared session IDs between 
different users/groups + assert len(sessions1.intersection(sessions2)) == 0 def test_session_encoder_multiple_users(df_module): @@ -142,7 +139,7 @@ def test_session_encoder_multiple_users(df_module): # After sorting by user_id and timestamp, each user should have 1 session # since all their events are within 30 minutes - session_ids = sbd.to_list(sbd.col(result, "session_id")) + session_ids = sbd.col(result, "session_id") # The encoder sorts by user_id then timestamp, so events are grouped by user # Check that there are exactly 2 sessions (one per user) @@ -194,7 +191,7 @@ def test_session_encoder_single_event(df_module): session_ids = sbd.to_list(sbd.col(result, "session_id")) assert len(session_ids) == 1 # Single event should create one session - assert session_ids[0] in [0, 1] # Could be 0 or 1 depending on implementation + assert session_ids[0] == 0 def test_session_encoder_empty_dataframe(df_module): @@ -257,6 +254,13 @@ def test_session_encoder_invalid_parameters(df_module): with pytest.raises(ValueError, match="session_gap must be a positive number"): se_zero.fit_transform(df) + # Test non-numeric session_gap + se_non_numeric = SessionEncoder( + by="user_id", timestamp="timestamp", session_gap="thirty" + ) + with pytest.raises(ValueError, match="session_gap must be a positive number"): + se_non_numeric.fit_transform(df) + def test_session_encoder_preserves_columns(df_module): """Test that original columns are preserved in output.""" From ac7389df35f9d9b65f9ce0f2b74e6c0f7f3fa3b7 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Wed, 25 Feb 2026 15:49:29 +0100 Subject: [PATCH 08/74] docstrings --- skrub/_session_encoder.py | 66 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 22ab0831a..b52c3c16e 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -14,6 +14,7 @@ import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin 
+from sklearn.utils.validation import check_is_fitted from . import _dataframe as sbd from ._dispatch import dispatch @@ -102,6 +103,11 @@ def _add_session_id_polars(X, is_new_session): class SessionEncoder(TransformerMixin, BaseEstimator): """Encode sessions from a dataframe. + A session is defined as a sequence of events from the same user (specified by + the `by` column) where consecutive events are separated by at most `session_gap` + minutes. When the time gap between consecutive events exceeds `session_gap`, or + when the user changes, a new session begins. + Parameters ---------- by : str @@ -121,6 +127,50 @@ class SessionEncoder(TransformerMixin, BaseEstimator): ---------- all_inputs_ : list of str All column names in the input dataframe. + + Examples + -------- + >>> import pandas as pd + >>> from datetime import datetime, timedelta + >>> encoder = SessionEncoder(by='user_id', timestamp='timestamp', session_gap=30) + + >>> # Create a sample dataframe with events from different users + >>> data = { + ... 'user_id': ['alice', 'alice', 'alice', 'bob', 'bob'], + ... 'timestamp': [ + ... pd.Timestamp('2024-01-01 10:00:00'), + ... pd.Timestamp('2024-01-01 10:05:00'), # 5 min later, same session + ... pd.Timestamp('2024-01-01 11:00:00'), # 55 min later, new session + ... pd.Timestamp('2024-01-01 10:00:00'), # Different user + ... pd.Timestamp('2024-01-01 10:20:00'), # 20 min later, same session + ... ], + ... 'action': ['login', 'view', 'logout', 'login', 'purchase'] + ... 
} + >>> df = pd.DataFrame(data) + >>> df + user_id timestamp action + 0 alice 2024-01-01 10:00:00 login + 1 alice 2024-01-01 10:05:00 view + 2 alice 2024-01-01 11:00:00 logout + 3 bob 2024-01-01 10:00:00 login + 4 bob 2024-01-01 10:20:00 purchase + + >>> result = encoder.fit_transform(df) + >>> result + user_id timestamp action session_id + 0 alice 2024-01-01 10:00:00 login 0 + 1 alice 2024-01-01 10:05:00 view 0 + 2 alice 2024-01-01 11:00:00 logout 1 + 3 bob 2024-01-01 10:00:00 login 2 + 4 bob 2024-01-01 10:20:00 purchase 2 + + In this example: + - Alice's first two events (10:00 and 10:05) are 5 minutes apart, so they form + session 1. + - Alice's third event (11:00) is 55 minutes after the previous one, exceeding + the 30-minute gap, so it forms a new session (session 2). + - Bob's events form session 3 (different user), with both events within the + 30-minute window. """ def __init__(self, by, timestamp, session_gap=30): @@ -201,3 +251,19 @@ def fit_transform(self, X, y=None): ) return X_with_session_id + + def transform(self, X): + """Transform the data by encoding sessions. + + Parameters + ---------- + X : pandas.DataFrame or polars.DataFrame + The input dataframe. + + Returns + ------- + pandas.DataFrame or polars.DataFrame + The transformed dataframe with session information. 
+ """ + check_is_fitted() + return self.fit_transform(X) From 8a61ea07187d9fa764c45f87f46778eb5f29c83b Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 26 Feb 2026 11:11:32 +0100 Subject: [PATCH 09/74] adding support for multiple by columns --- skrub/_session_encoder.py | 130 +++++++++++++++++++++------- skrub/tests/test_session_encoder.py | 95 ++++++++++++++++++++ 2 files changed, 192 insertions(+), 33 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index b52c3c16e..53243e4f6 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -1,13 +1,13 @@ """ The SessionEncoder is a transformer that takes as input: -- a "by" column, which identifies a user - a "timestamp" column, which identifies the time of an event +- a "by" column or list of columns, which identifies a user - a "session_gap" value, which identifies the maximum allowed gap between events in a session It returns a dataframe with the same number of rows as the input, but with the column "session_id": a unique identifier for each session, which is a combination -of the "by" column and a session number +of the "by" column(s) and a session number """ import numbers @@ -18,6 +18,12 @@ from . 
import _dataframe as sbd from ._dispatch import dispatch +from ._utils import random_string + +try: + import polars as pl +except ImportError: + pass @dispatch @@ -31,7 +37,7 @@ def _check_is_new_session(X, by, timestamp, session_gap): @_check_is_new_session.specialize("pandas") def _check_is_new_session_pandas(X, by, timestamp, session_gap): # check if the "by" column changes - char_diff = X[by].diff().fillna(0) > 0 + char_diff = (X[by].diff().fillna(0) > 0).any(axis=1) # check if the time difference between events exceeds the session gap time_diff = ( X[timestamp].astype(int).diff().fillna(0) // 10**3 > session_gap * 60 * 1000 @@ -45,7 +51,9 @@ def _check_is_new_session_pandas(X, by, timestamp, session_gap): @_check_is_new_session.specialize("polars") def _check_is_new_session_polars(X, by, timestamp, session_gap): # check if the "by" column changes - char_diff = X[by].diff().fill_null(0) > 0 + char_diff = X.select( + pl.any_horizontal(pl.col(by).diff().fill_null(0) > 0) + ).to_series() # check if the time difference between events exceeds the session gap time_diff = ( X[timestamp].dt.epoch("ms").diff().fill_null(0) > session_gap * 60 * 1000 @@ -89,9 +97,8 @@ def _add_session_id(X, is_new_session): @_add_session_id.specialize("pandas") def _add_session_id_pandas(X, is_new_session): # Compute cumulative sum of is_new_session to create session IDs - X_copy = X.copy() - X_copy["session_id"] = is_new_session.cumsum() - return X_copy + X["session_id"] = is_new_session.cumsum() + return X @_add_session_id.specialize("polars") @@ -103,21 +110,23 @@ def _add_session_id_polars(X, is_new_session): class SessionEncoder(TransformerMixin, BaseEstimator): """Encode sessions from a dataframe. - A session is defined as a sequence of events from the same user (specified by - the `by` column) where consecutive events are separated by at most `session_gap` - minutes. 
When the time gap between consecutive events exceeds `session_gap`, or + A session is defined as a sequence of events where consecutive events are separated + by at most `session_gap` minutes. Additionally, it is possible to provide a column + or list of columns that identifies the user (specified by the `by` column). + When the time gap between consecutive events exceeds `session_gap`, or when the user changes, a new session begins. Parameters ---------- - by : str - The name of the column that identifies a user. This column is used to - group events into sessions. - timestamp : str The name of the column that identifies the time of an event. This column is used to determine the start and end of a session. + by : optional[str, list[str]], default=None + The name of the column, or list of columns, that identifies a user. This + parameter is used to group events into sessions. If not provided, all + events are considered to belong to the same user. + session_gap : int, default=30 The maximum gap (in minutes) between events in a session. If the gap between two events exceeds this value, they are considered to be in @@ -171,11 +180,53 @@ class SessionEncoder(TransformerMixin, BaseEstimator): the 30-minute gap, so it forms a new session (session 2). - Bob's events form session 3 (different user), with both events within the 30-minute window. + + You can also identify users by multiple columns. For instance, the same user + on different devices should have separate sessions: + + >>> encoder_multi = SessionEncoder( + ... by=['user_id', 'device_id'], + ... timestamp='timestamp', + ... session_gap=30 + ... ) + + >>> # Create a sample dataframe where user_id + device_id identifies a user + >>> data_multi = { + ... 'user_id': [1, 1, 1, 1, 2, 2], + ... 'device_id': ['mobile', 'mobile', 'desktop', 'desktop', 'mobile', 'mobile'], + ... 'timestamp': [ + ... pd.Timestamp('2024-01-01 10:00:00'), + ... pd.Timestamp('2024-01-01 10:10:00'), # 10 min later, same session + ... 
pd.Timestamp('2024-01-01 10:05:00'), # Different device (sorted), + ... # different session + ... pd.Timestamp('2024-01-01 10:20:00'), # 15 min later, same session + ... pd.Timestamp('2024-01-01 10:00:00'), # Different user + ... pd.Timestamp('2024-01-01 10:15:00'), # 15 min later, same session + ... ], + ... 'action': ['view', 'purchase', 'view', 'checkout', 'login', 'view'] + ... } + >>> df_multi = pd.DataFrame(data_multi) + >>> result_multi = encoder_multi.fit_transform(df_multi) + >>> result_multi[['user_id', 'device_id', 'timestamp', 'action', 'session_id']] + user_id device_id timestamp action session_id + 0 1 desktop 2024-01-01 10:05:00 view 0 + 1 1 desktop 2024-01-01 10:20:00 checkout 0 + 2 1 mobile 2024-01-01 10:00:00 view 1 + 3 1 mobile 2024-01-01 10:10:00 purchase 1 + 4 2 mobile 2024-01-01 10:00:00 login 2 + 5 2 mobile 2024-01-01 10:15:00 view 2 + + In this example: + - User 1 on "desktop" has session 0. + - User 1 on "mobile" has session 1 (different device, so separate session). + - User 2 on "mobile" has session 2 (different user). 
+ + """ - def __init__(self, by, timestamp, session_gap=30): - self.by = by + def __init__(self, timestamp, by=None, session_gap=30): self.timestamp = timestamp + self.by = by self.session_gap = session_gap def fit(self, X, y=None): @@ -215,8 +266,18 @@ def fit_transform(self, X, y=None): """ self.all_inputs_ = sbd.column_names(X) # check that the required columns are present in the input dataframe - if self.by not in self.all_inputs_: + if self.by is not None: + if isinstance(self.by, str): + self.by_columns = [self.by] + elif isinstance(self.by, list): + self.by_columns = self.by + else: + raise TypeError("by must be a string or a list of strings") + if self.by is not None and any( + col not in self.all_inputs_ for col in self.by_columns + ): raise ValueError(f"Column '{self.by}' not found in input dataframe") + if self.timestamp not in self.all_inputs_: raise ValueError(f"Column '{self.timestamp}' not found in input dataframe") @@ -225,18 +286,9 @@ def fit_transform(self, X, y=None): raise ValueError("session_gap must be a positive number") # sort the input dataframe by the "by" and "timestamp" columns - X_sorted = sbd.sort(X, by=[self.by, self.timestamp]) # noqa + X_sorted = sbd.sort(X, by=self.by_columns + [self.timestamp]) - # convert by column to string if it's not already, to ensure - # that the diff operation works correctly - if not sbd.is_numeric(X_sorted[self.by]): - factorized_by = f"{self.by}_factorized" - X_factorized = sbd.with_columns( - X_sorted, **{factorized_by: _factorize_column(X_sorted, self.by)} - ) - else: - factorized_by = self.by - X_factorized = X_sorted + X_factorized, factorized_by = self._factorize_columns(X_sorted) # mark the start of a new session by checking the difference is_new_session = _check_is_new_session( X_factorized, factorized_by, self.timestamp, self.session_gap @@ -245,11 +297,8 @@ def fit_transform(self, X, y=None): X_with_session_id = _add_session_id(X_factorized, is_new_session) # drop the factorized "by" column if 
the original "by" column was not numeric - if factorized_by != self.by: - X_with_session_id = sbd.drop_columns( - X_with_session_id, columns=[factorized_by] - ) - + to_drop = [col for col in factorized_by if col not in self.by_columns] + X_with_session_id = sbd.drop_columns(X_with_session_id, to_drop) return X_with_session_id def transform(self, X): @@ -267,3 +316,18 @@ def transform(self, X): """ check_is_fitted() return self.fit_transform(X) + + def _factorize_columns(self, X): + # convert by column to string if it's not already, to ensure + # that the diff operation works correctly + + factorized_columns = { + f"{col}_factorized_skrub_{random_string()}": _factorize_column(X, col) + if not sbd.is_numeric(X[col]) + else X[col] + for col in self.by_columns + } + + X_factorized = sbd.with_columns(X, **factorized_columns) + + return X_factorized, list(factorized_columns.keys()) diff --git a/skrub/tests/test_session_encoder.py b/skrub/tests/test_session_encoder.py index 19e99efad..d2037e4ab 100644 --- a/skrub/tests/test_session_encoder.py +++ b/skrub/tests/test_session_encoder.py @@ -47,6 +47,53 @@ def example_session_data(df_module): ) +@pytest.fixture +def example_session_data_multi_by(df_module): + """Create example session data where a user is identified by two columns. + + A user is uniquely identified by the combination of ``user_id`` and + ``device_id``. The same ``user_id`` on two different devices produces + independent sessions, which lets us verify that ``by`` accepts a list of + column names. 
+ """ + timestamps = [] + user_ids = [] + device_ids = [] + + base_time = datetime.datetime(2024, 1, 1) + + # user 1, device "mobile": 2 sessions, 10 days apart, 4 events each + for session in range(2): + session_start = base_time + datetime.timedelta(days=session * 10) + for event in range(4): + timestamps.append(session_start + datetime.timedelta(minutes=event * 3)) + user_ids.append(1) + device_ids.append("mobile") + + # user 1, device "desktop": 1 session, 3 events + # (same user_id as above but different device → separate sessions) + session_start = base_time + datetime.timedelta(days=5) + for event in range(3): + timestamps.append(session_start + datetime.timedelta(minutes=event * 4)) + user_ids.append(1) + device_ids.append("desktop") + + # user 2, device "mobile": 1 session, 5 events + session_start = base_time + datetime.timedelta(days=20) + for event in range(5): + timestamps.append(session_start + datetime.timedelta(minutes=event * 2)) + user_ids.append(2) + device_ids.append("mobile") + + return df_module.make_dataframe( + { + "timestamp": timestamps, + "user_id": user_ids, + "device_id": device_ids, + } + ) + + @pytest.mark.parametrize( "by_column,expected_sessions,group_key_to_sessions", [ @@ -115,6 +162,54 @@ def test_session_encoder_different_users_different_sessions( assert len(sessions1.intersection(sessions2)) == 0 +def test_session_encoder_multi_by_columns(example_session_data_multi_by): + """Test sessionization when a user is identified by a combination of columns. + + The fixture has user_id=1 on two devices ("mobile" and "desktop"). When + ``by=["user_id", "device_id"]``, those two device contexts must be treated + as independent groups, producing separate session IDs even though they share + the same ``user_id``. 
+ + Expected sessions: + - (user_id=1, device_id="mobile") → 2 sessions + - (user_id=1, device_id="desktop") → 1 session + - (user_id=2, device_id="mobile") → 1 session + Total: 4 sessions + """ + se = SessionEncoder( + by=["user_id", "device_id"], timestamp="timestamp", session_gap=30 + ) + result = se.fit_transform(example_session_data_multi_by) + + session_ids = sbd.to_list(sbd.col(result, "session_id")) + user_ids = sbd.to_list(sbd.col(result, "user_id")) + device_ids = sbd.to_list(sbd.col(result, "device_id")) + + # 4 distinct sessions overall + assert len(set(session_ids)) == 4 + + # create a dict that groups sessions by (user_id, device_id) pair + group_sessions: dict = {} + for uid, did, sid in zip(user_ids, device_ids, session_ids): + key = (uid, did) + # Each (user_id, device_id) pair should have its own set of session IDs + # We use a set to track unique session IDs for each group key + group_sessions.setdefault(key, set()).add(sid) + + # assert that each (user_id, device_id) pair has the expected number of sessions + assert len(group_sessions[(1, "mobile")]) == 2 + assert len(group_sessions[(1, "desktop")]) == 1 + assert len(group_sessions[(2, "mobile")]) == 1 + + # sessions belonging to different (user_id, device_id) pairs must be disjoint + keys = list(group_sessions) + # go through each pair of group keys (user_id, device_id) + for i, k1 in enumerate(keys): + for k2 in keys[i + 1 :]: + # check that the sets in group_sessions for different keys are disjoint + assert group_sessions[k1].isdisjoint(group_sessions[k2]) + + def test_session_encoder_multiple_users(df_module): """Test sessionization with multiple users interleaved.""" timestamps = [] From a70006fc2f1218f312b1b5ccaf9f7e338aa14b05 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 26 Feb 2026 11:26:56 +0100 Subject: [PATCH 10/74] fixing optional by --- skrub/_session_encoder.py | 26 ++++++++++++++------- skrub/tests/test_session_encoder.py | 35 +++++++++++++++++++++++++++++ 2 files 
changed, 53 insertions(+), 8 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 53243e4f6..b68375778 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -36,12 +36,14 @@ def _check_is_new_session(X, by, timestamp, session_gap): @_check_is_new_session.specialize("pandas") def _check_is_new_session_pandas(X, by, timestamp, session_gap): - # check if the "by" column changes - char_diff = (X[by].diff().fillna(0) > 0).any(axis=1) # check if the time difference between events exceeds the session gap time_diff = ( X[timestamp].astype(int).diff().fillna(0) // 10**3 > session_gap * 60 * 1000 ) + if not by: + return time_diff + # check if the "by" column changes + char_diff = (X[by].diff().fillna(0) > 0).any(axis=1) # a new session starts if either the "by" column changes or the time gap is # exceeded is_new_session = char_diff | time_diff @@ -50,14 +52,16 @@ def _check_is_new_session_pandas(X, by, timestamp, session_gap): @_check_is_new_session.specialize("polars") def _check_is_new_session_polars(X, by, timestamp, session_gap): - # check if the "by" column changes - char_diff = X.select( - pl.any_horizontal(pl.col(by).diff().fill_null(0) > 0) - ).to_series() # check if the time difference between events exceeds the session gap time_diff = ( X[timestamp].dt.epoch("ms").diff().fill_null(0) > session_gap * 60 * 1000 ) + if not by: + return time_diff + # check if the "by" column changes + char_diff = X.select( + pl.any_horizontal(pl.col(by).diff().fill_null(0) > 0) + ).to_series() # a new session starts if either the "by" column changes or the time gap is # exceeded is_new_session = char_diff | time_diff @@ -286,7 +290,12 @@ def fit_transform(self, X, y=None): raise ValueError("session_gap must be a positive number") # sort the input dataframe by the "by" and "timestamp" columns - X_sorted = sbd.sort(X, by=self.by_columns + [self.timestamp]) + sort_by = ( + self.by_columns + [self.timestamp] + if self.by is not None + 
else [self.timestamp] + ) + X_sorted = sbd.sort(X, by=sort_by) X_factorized, factorized_by = self._factorize_columns(X_sorted) # mark the start of a new session by checking the difference @@ -320,7 +329,8 @@ def transform(self, X): def _factorize_columns(self, X): # convert by column to string if it's not already, to ensure # that the diff operation works correctly - + if not self.by: + return X, [] factorized_columns = { f"{col}_factorized_skrub_{random_string()}": _factorize_column(X, col) if not sbd.is_numeric(X[col]) diff --git a/skrub/tests/test_session_encoder.py b/skrub/tests/test_session_encoder.py index d2037e4ab..8f0d4b029 100644 --- a/skrub/tests/test_session_encoder.py +++ b/skrub/tests/test_session_encoder.py @@ -271,6 +271,41 @@ def test_session_encoder_time_gap_threshold(df_module): assert len(set(session_ids_40)) == 1 +def test_session_encoder_no_user_column(df_module): + """Test sessionization without a user identifier column. + + When ``by`` is None, all events are treated as from the same "user", and + sessions are separated only by time gaps. 
+ """ + timestamps = [ + datetime.datetime(2024, 1, 1, 10, 0), + datetime.datetime(2024, 1, 1, 10, 10), # 10 min gap + datetime.datetime(2024, 1, 1, 10, 15), # 5 min gap (within 30 min) + datetime.datetime(2024, 1, 1, 11, 0), # 45 min gap (exceeds 30 min) + datetime.datetime(2024, 1, 1, 11, 10), # 10 min gap (within 30 min) + ] + + df = df_module.make_dataframe( + { + "timestamp": timestamps, + } + ) + + # Without 'by', sessions are separated only by time gaps + se = SessionEncoder(by=None, timestamp="timestamp", session_gap=30) + result = se.fit_transform(df) + + session_ids = sbd.to_list(sbd.col(result, "session_id")) + # Expected: 2 sessions (events 0-2 in session 0, event 3 starts new session) + # Then event 4 continues session 1 + assert len(set(session_ids)) == 2 + assert ( + session_ids[0] == session_ids[1] == session_ids[2] + ) # First 3 events in session 0 + assert session_ids[3] == session_ids[4] # Last 2 events in session 1 + assert session_ids[0] != session_ids[3] # Sessions are different + + def test_session_encoder_single_event(df_module): """Test sessionization with single event per user.""" df = df_module.make_dataframe( From 429f15552a838a08f05c65586365db87d861bac7 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 26 Feb 2026 11:36:24 +0100 Subject: [PATCH 11/74] ddocs --- doc/api_reference.py | 1 + skrub/_session_encoder.py | 48 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/doc/api_reference.py b/doc/api_reference.py index b247ae572..5f96daf0e 100644 --- a/doc/api_reference.py +++ b/doc/api_reference.py @@ -91,6 +91,7 @@ "SimilarityEncoder", "ToCategorical", "DatetimeEncoder", + "SessionEncoder", "ToDatetime", "ToFloat", ], diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index b68375778..c4f007dc5 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -115,9 +115,9 @@ class SessionEncoder(TransformerMixin, BaseEstimator): """Encode sessions 
from a dataframe. A session is defined as a sequence of events where consecutive events are separated - by at most `session_gap` minutes. Additionally, it is possible to provide a column - or list of columns that identifies the user (specified by the `by` column). - When the time gap between consecutive events exceeds `session_gap`, or + by at most ``session_gap`` minutes. Additionally, it is possible to provide a column + or list of columns that identifies the user (specified by the ``by`` column). + When the time gap between consecutive events exceeds ``session_gap``, or when the user changes, a new session begins. Parameters @@ -143,6 +143,9 @@ class SessionEncoder(TransformerMixin, BaseEstimator): Examples -------- + Consider this example where we have a dataframe with user events, and we want + to identify sessions based on a 30-minute gap between events for each user: + >>> import pandas as pd >>> from datetime import datetime, timedelta >>> encoder = SessionEncoder(by='user_id', timestamp='timestamp', session_gap=30) @@ -178,6 +181,7 @@ class SessionEncoder(TransformerMixin, BaseEstimator): 4 bob 2024-01-01 10:20:00 purchase 2 In this example: + - Alice's first two events (10:00 and 10:05) are 5 minutes apart, so they form session 1. - Alice's third event (11:00) is 55 minutes after the previous one, exceeding @@ -221,10 +225,48 @@ class SessionEncoder(TransformerMixin, BaseEstimator): 5 2 mobile 2024-01-01 10:15:00 view 2 In this example: + - User 1 on "desktop" has session 0. - User 1 on "mobile" has session 1 (different device, so separate session). - User 2 on "mobile" has session 2 (different user). + You can also use SessionEncoder without a user identifier column. In this case, + sessions are separated only by time gaps. This is useful for analyzing a single + timeseries or events that don't have a user dimension: + + >>> encoder_no_by = SessionEncoder( + ... by=None, + ... timestamp='timestamp', + ... session_gap=30 + ... 
) + + >>> # Create a sample dataframe with only timestamps + >>> data_no_by = { + ... 'timestamp': [ + ... pd.Timestamp('2024-01-01 10:00:00'), + ... pd.Timestamp('2024-01-01 10:10:00'), # 10 min gap + ... pd.Timestamp('2024-01-01 10:15:00'), # 5 min gap, still in session + ... pd.Timestamp('2024-01-01 11:00:00'), # 45 min gap, new session + ... pd.Timestamp('2024-01-01 11:10:00'), # 10 min gap, continue session + ... ], + ... 'event_type': ['start', 'action', 'action', 'restart', 'action'] + ... } + >>> df_no_by = pd.DataFrame(data_no_by) + >>> result_no_by = encoder_no_by.fit_transform(df_no_by) + >>> result_no_by[['timestamp', 'event_type', 'session_id']] + timestamp event_type session_id + 0 2024-01-01 10:00:00 start 0 + 1 2024-01-01 10:10:00 action 0 + 2 2024-01-01 10:15:00 action 0 + 3 2024-01-01 11:00:00 restart 1 + 4 2024-01-01 11:10:00 action 1 + + In this example: + + - Events at 10:00, 10:10, and 10:15 form session 0 (all gaps < 30 min). + - The event at 11:00 starts a new session 1 (45 min gap > 30 min). + - The event at 11:10 continues session 1 (10 min gap < 30 min). 
+ """ From 23c4111fe5055972890b49ac947a57342d839d52 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 26 Feb 2026 11:49:37 +0100 Subject: [PATCH 12/74] fixing a compatibility problem --- skrub/_session_encoder.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index c4f007dc5..a70f636d7 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -13,6 +13,7 @@ import numbers import pandas as pd +from packaging.version import parse from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.validation import check_is_fitted @@ -36,10 +37,18 @@ def _check_is_new_session(X, by, timestamp, session_gap): @_check_is_new_session.specialize("pandas") def _check_is_new_session_pandas(X, by, timestamp, session_gap): - # check if the time difference between events exceeds the session gap - time_diff = ( - X[timestamp].astype(int).diff().fillna(0) // 10**3 > session_gap * 60 * 1000 - ) + # pandas 3.0 changed the resolution of astype(int) for datetime columns from + # nanoseconds to milliseconds, so we need to adjust the time difference calculation + # accordingly + if parse(pd.__version__).major <= 2: + # check if the time difference between events exceeds the session gap + time_diff = ( + X[timestamp].astype(int).diff().fillna(0) // 10**6 > session_gap * 60 * 1000 + ) + else: + time_diff = ( + X[timestamp].astype(int).diff().fillna(0) // 10**3 > session_gap * 60 * 1000 + ) if not by: return time_diff # check if the "by" column changes From acb091f272465f7df407d0063da1b56e90b38e77 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 26 Feb 2026 11:51:52 +0100 Subject: [PATCH 13/74] changelog --- CHANGES.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index cfe3fbcd2..108cf60c0 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -20,8 +20,10 @@ New Features with 50-100 nodes or more). 
Moreover, the evaluation of large DataOps has also become faster. :pr:`1890` by :user:`Jérôme Dockès `. - The :class:`SessionEncoder` is now available. This encoder takes a dataframe with - a timestamp column and a grouping column (e.g., user ID) and computes sessions - for each value in the grouping column. Additional statistics can also be added. + a timestamp column and computes sessions based on the given session duration. + Additionally, it is possible to provide a ``by`` column or list of columns + (e.g., user ID or (user ID, user device)) to compute sessions for each grouping + value. :pr:`1930` by :user:`Riccardo Cappuzzo `. Changes From d69fb9e2ac41e316475a8f959343a3e4868ebbaf Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 26 Feb 2026 13:35:51 +0100 Subject: [PATCH 14/74] improving tests --- skrub/_session_encoder.py | 29 ++++- skrub/tests/test_session_encoder.py | 164 +++++++++++++++++++++++++--- 2 files changed, 171 insertions(+), 22 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index a70f636d7..2197d2d9e 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -127,7 +127,8 @@ class SessionEncoder(TransformerMixin, BaseEstimator): by at most ``session_gap`` minutes. Additionally, it is possible to provide a column or list of columns that identifies the user (specified by the ``by`` column). When the time gap between consecutive events exceeds ``session_gap``, or - when the user changes, a new session begins. + when the user changes, a new session begins. All unrelated columns are passed + through unchanged. 
Parameters ---------- @@ -327,11 +328,11 @@ def fit_transform(self, X, y=None): elif isinstance(self.by, list): self.by_columns = self.by else: - raise TypeError("by must be a string or a list of strings") - if self.by is not None and any( - col not in self.all_inputs_ for col in self.by_columns - ): - raise ValueError(f"Column '{self.by}' not found in input dataframe") + raise TypeError("by must be a string, a list of strings, or None") + if self.by is not None: + for col in self.by_columns: + if col not in self.all_inputs_: + raise ValueError(f"Column '{col}' not found in input dataframe") if self.timestamp not in self.all_inputs_: raise ValueError(f"Column '{self.timestamp}' not found in input dataframe") @@ -392,3 +393,19 @@ def _factorize_columns(self, X): X_factorized = sbd.with_columns(X, **factorized_columns) return X_factorized, list(factorized_columns.keys()) + + def get_feature_names_out(self, input_features=None): + """Return the column names of the output of ``transform`` as a list of strings. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Ignored. + + Returns + ------- + list of strings + The column names. 
+ """ + check_is_fitted() + return ["session_id"] diff --git a/skrub/tests/test_session_encoder.py b/skrub/tests/test_session_encoder.py index 8f0d4b029..76b9d301e 100644 --- a/skrub/tests/test_session_encoder.py +++ b/skrub/tests/test_session_encoder.py @@ -1,9 +1,15 @@ import datetime +import numpy as np import pytest from skrub import SessionEncoder from skrub import _dataframe as sbd +from skrub._session_encoder import ( + _add_session_id, + _check_is_new_session, + _factorize_column, +) @pytest.fixture @@ -340,29 +346,53 @@ def test_session_encoder_empty_dataframe(df_module): assert "session_id" in sbd.column_names(result) -def test_session_encoder_missing_column_error(df_module): - """Test that missing columns raise appropriate errors.""" +@pytest.mark.parametrize( + "by_param,timestamp_param,expected_error_type,expected_error_match", + [ + ( + "wrong_column", + "timestamp", + ValueError, + "Column 'wrong_column' not found", + ), + ( + "user_id", + "wrong_column", + ValueError, + "Column 'wrong_column' not found", + ), + ( + ["wrong_column", "user_device"], + "timestamp", + ValueError, + "Column 'wrong_column' not found", + ), + ( + 23, # invalid type for 'by' + "timestamp", + TypeError, + "by must be a string, a list of strings, or None", + ), + ], +) +def test_session_encoder_missing_column_error( + df_module, by_param, timestamp_param, expected_error_type, expected_error_match +): + """Test that missing columns and invalid parameters raise appropriate errors.""" df = df_module.make_dataframe( { "timestamp": [datetime.datetime(2024, 1, 1)], - "wrong_column": [101], - } - ) - - se = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=30) - with pytest.raises(ValueError, match="Column 'user_id' not found"): - se.fit_transform(df) - - df2 = df_module.make_dataframe( - { - "wrong_timestamp": [datetime.datetime(2024, 1, 1)], "user_id": [101], + "user_device": ["mobile"], } ) - se2 = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=30) 
- with pytest.raises(ValueError, match="Column 'timestamp' not found"): - se2.fit_transform(df2) + se = SessionEncoder( + by=by_param, + timestamp=timestamp_param, + ) + with pytest.raises(expected_error_type, match=expected_error_match): + se.fit_transform(df) def test_session_encoder_invalid_parameters(df_module): @@ -435,3 +465,105 @@ def test_session_encoder_fit_and_transform(df_module): # Test that all_inputs_ is set after fit assert hasattr(se, "all_inputs_") + + +# --------------------------------------------------------------------------- +# Tests for the internal dispatched helper functions +# --------------------------------------------------------------------------- + + +def test_factorize_column_string(df_module): + """_factorize_column should map string values to consecutive integer codes.""" + df = df_module.make_dataframe({"user": ["alice", "bob", "alice", "charlie"]}) + codes = _factorize_column(df, "user") + + # alice appears first, so it should get code 0 + assert codes[0] == codes[2] # both "alice" → same code + assert codes[1] != codes[0] # "bob" differs from "alice" + assert codes[3] != codes[0] # "charlie" differs from "alice" + assert codes[1] != codes[3] # "bob" differs from "charlie" + assert all(isinstance(c, np.int64) for c in codes) + assert all(int(c) == expected for c, expected in zip(codes, [0, 1, 0, 2])) + + +def test_factorize_column_numeric(df_module): + """_factorize_column on a numeric column should return integer codes.""" + df = df_module.make_dataframe({"user_id": [10, 20, 10, 30]}) + codes = _factorize_column(df, "user_id") + + assert codes[0] == codes[2] # both 10 → same code + assert codes[1] != codes[0] # 20 differs from 10 + assert codes[3] != codes[0] # 30 differs from 10 + assert all(isinstance(c, np.int64) for c in codes) + assert all(int(c) == expected for c, expected in zip(codes, [0, 1, 0, 2])) + + +def test_add_session_id(df_module): + """_add_session_id should add a 'session_id' column computed as a cumulative + 
sum of the boolean ``is_new_session`` series. + + We obtain ``is_new_session`` from ``_check_is_new_session`` (no by-group) so + that the boolean series has the correct type for each dataframe backend. + """ + # Gaps: -, 5 min, 55 min, 10 min, 50 min + # is_new_session: [False, False, True, False, True] + # cumsum: [ 0, 0, 1, 1, 2 ] + df = df_module.make_dataframe( + { + "timestamp": [ + datetime.datetime(2024, 1, 1, 10, 0), + datetime.datetime(2024, 1, 1, 10, 5), # 5 min – same session + datetime.datetime(2024, 1, 1, 11, 0), # 55 min – new session + datetime.datetime(2024, 1, 1, 11, 10), # 10 min – same session + datetime.datetime(2024, 1, 1, 12, 0), # 50 min – new session + ] + } + ) + is_new_session = _check_is_new_session(df, [], "timestamp", 30) + result = _add_session_id(df, is_new_session) + + assert "session_id" in sbd.column_names(result) + assert sbd.to_list(sbd.col(result, "session_id")) == [0, 0, 1, 1, 2] + + +def test_check_is_new_session_no_by(df_module): + """_check_is_new_session with an empty by-list uses only the time gap.""" + df = df_module.make_dataframe( + { + "timestamp": [ + datetime.datetime(2024, 1, 1, 10, 0), + datetime.datetime(2024, 1, 1, 10, 10), # 10 min — within gap + datetime.datetime(2024, 1, 1, 11, 0), # 50 min — exceeds gap + datetime.datetime(2024, 1, 1, 11, 5), # 5 min — within gap + ] + } + ) + is_new = sbd.to_list(_check_is_new_session(df, [], "timestamp", 30)) + + # First row is never a new session (no previous row), all others depend on gap + assert not is_new[0] # (first row) + assert not is_new[1] # 10 min < 30 min + assert is_new[2] # 50 min > 30 min + assert not is_new[3] # 5 min < 30 min + + +def test_check_is_new_session_with_by(df_module): + """_check_is_new_session detects a new session when the group key changes, + even if the time gap is small.""" + df = df_module.make_dataframe( + { + "user_id": [1, 1, 2, 2], + "timestamp": [ + datetime.datetime(2024, 1, 1, 10, 0), + datetime.datetime(2024, 1, 1, 10, 5), # same 
user, 5 min gap + datetime.datetime(2024, 1, 1, 10, 6), # different user, 1 min gap + datetime.datetime(2024, 1, 1, 10, 10), # same user, 4 min gap + ], + } + ) + is_new = sbd.to_list(_check_is_new_session(df, ["user_id"], "timestamp", 30)) + + assert not is_new[0] # first row + assert not is_new[1] # same user, small gap + assert is_new[2] # user changed → new session + assert not is_new[3] # same user, small gap From e0199f5fd2ddac2520c709c059aeaf9d6a97e01a Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 26 Feb 2026 13:46:33 +0100 Subject: [PATCH 15/74] renaming session id column --- skrub/_session_encoder.py | 25 ++++++++++----- skrub/tests/test_session_encoder.py | 48 ++++++++++++++++++++--------- 2 files changed, 51 insertions(+), 22 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 2197d2d9e..d5b793b82 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -100,7 +100,7 @@ def _factorize_column_polars(X, column_name): @dispatch -def _add_session_id(X, is_new_session): +def _add_session_id(X, is_new_session, column_name): # Avoid circular import from ._dispatch import raise_dispatch_unregistered_type @@ -108,16 +108,16 @@ def _add_session_id(X, is_new_session): @_add_session_id.specialize("pandas") -def _add_session_id_pandas(X, is_new_session): +def _add_session_id_pandas(X, is_new_session, column_name): # Compute cumulative sum of is_new_session to create session IDs - X["session_id"] = is_new_session.cumsum() + X[column_name] = is_new_session.cumsum() return X @_add_session_id.specialize("polars") -def _add_session_id_polars(X, is_new_session): +def _add_session_id_polars(X, is_new_session, column_name): # Add session_id by computing cumulative sum of is_new_session - return X.with_columns(is_new_session.cum_sum().alias("session_id")) + return X.with_columns(is_new_session.cum_sum().alias(column_name)) class SessionEncoder(TransformerMixin, BaseEstimator): @@ -151,6 +151,10 @@ class 
SessionEncoder(TransformerMixin, BaseEstimator): all_inputs_ : list of str All column names in the input dataframe. + all_outputs_: list of str + All column names in the input dataframe plus the new column that identifies + the session, with name "{timestamp}_session_id". + Examples -------- Consider this example where we have a dataframe with user events, and we want @@ -355,11 +359,16 @@ def fit_transform(self, X, y=None): X_factorized, factorized_by, self.timestamp, self.session_gap ) # add the session id - X_with_session_id = _add_session_id(X_factorized, is_new_session) + session_col_name = f"{self.timestamp}_session_id" + X_with_session_id = _add_session_id( + X_factorized, is_new_session, session_col_name + ) # drop the factorized "by" column if the original "by" column was not numeric to_drop = [col for col in factorized_by if col not in self.by_columns] X_with_session_id = sbd.drop_columns(X_with_session_id, to_drop) + + self.all_outputs_ = sbd.column_names(X_with_session_id) return X_with_session_id def transform(self, X): @@ -407,5 +416,5 @@ def get_feature_names_out(self, input_features=None): list of strings The column names. 
""" - check_is_fitted() - return ["session_id"] + check_is_fitted(self) + return self.all_outputs_ diff --git a/skrub/tests/test_session_encoder.py b/skrub/tests/test_session_encoder.py index 76b9d301e..8fc670ae5 100644 --- a/skrub/tests/test_session_encoder.py +++ b/skrub/tests/test_session_encoder.py @@ -116,12 +116,12 @@ def test_session_encoder_basic( result = se.fit_transform(example_session_data) # Check that we have the expected total number of sessions - session_ids = sbd.to_list(sbd.col(result, "session_id")) + session_ids = sbd.to_list(sbd.col(result, "timestamp_session_id")) unique_sessions = set(session_ids) assert len(unique_sessions) == expected_sessions # content of the "session_id" column after sessionization - session_ids = sbd.to_list(sbd.col(result, "session_id")) + session_ids = sbd.to_list(sbd.col(result, "timestamp_session_id")) # content of the "by" column (user_id or username) group_values = sbd.to_list(sbd.col(result, by_column)) @@ -150,7 +150,7 @@ def test_session_encoder_different_users_different_sessions( result = se.fit_transform(example_session_data) # content of the "session_id" column after sessionization - session_ids = sbd.to_list(sbd.col(result, "session_id")) + session_ids = sbd.to_list(sbd.col(result, "timestamp_session_id")) # content of the "by" column (user_id or username) group_values = sbd.to_list(sbd.col(result, by_column)) @@ -187,7 +187,7 @@ def test_session_encoder_multi_by_columns(example_session_data_multi_by): ) result = se.fit_transform(example_session_data_multi_by) - session_ids = sbd.to_list(sbd.col(result, "session_id")) + session_ids = sbd.to_list(sbd.col(result, "timestamp_session_id")) user_ids = sbd.to_list(sbd.col(result, "user_id")) device_ids = sbd.to_list(sbd.col(result, "device_id")) @@ -240,7 +240,7 @@ def test_session_encoder_multiple_users(df_module): # After sorting by user_id and timestamp, each user should have 1 session # since all their events are within 30 minutes - session_ids = 
sbd.col(result, "session_id") + session_ids = sbd.col(result, "timestamp_session_id") # The encoder sorts by user_id then timestamp, so events are grouped by user # Check that there are exactly 2 sessions (one per user) @@ -267,13 +267,13 @@ def test_session_encoder_time_gap_threshold(df_module): # With 20-minute gap: should create 2 sessions (split at 35-min gap) se_20 = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=20) result_20 = se_20.fit_transform(df) - session_ids_20 = sbd.to_list(sbd.col(result_20, "session_id")) + session_ids_20 = sbd.to_list(sbd.col(result_20, "timestamp_session_id")) assert len(set(session_ids_20)) == 2 # With 40-minute gap: should create 1 session (all gaps < 40 min) se_40 = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=40) result_40 = se_40.fit_transform(df) - session_ids_40 = sbd.to_list(sbd.col(result_40, "session_id")) + session_ids_40 = sbd.to_list(sbd.col(result_40, "timestamp_session_id")) assert len(set(session_ids_40)) == 1 @@ -301,7 +301,7 @@ def test_session_encoder_no_user_column(df_module): se = SessionEncoder(by=None, timestamp="timestamp", session_gap=30) result = se.fit_transform(df) - session_ids = sbd.to_list(sbd.col(result, "session_id")) + session_ids = sbd.to_list(sbd.col(result, "timestamp_session_id")) # Expected: 2 sessions (events 0-2 in session 0, event 3 starts new session) # Then event 4 continues session 1 assert len(set(session_ids)) == 2 @@ -324,7 +324,7 @@ def test_session_encoder_single_event(df_module): se = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=30) result = se.fit_transform(df) - session_ids = sbd.to_list(sbd.col(result, "session_id")) + session_ids = sbd.to_list(sbd.col(result, "timestamp_session_id")) assert len(session_ids) == 1 # Single event should create one session assert session_ids[0] == 0 @@ -343,7 +343,7 @@ def test_session_encoder_empty_dataframe(df_module): result = se.fit_transform(df) assert sbd.shape(result)[0] == 0 - assert 
"session_id" in sbd.column_names(result) + assert "timestamp_session_id" in sbd.column_names(result) @pytest.mark.parametrize( @@ -442,7 +442,7 @@ def test_session_encoder_preserves_columns(df_module): assert "timestamp" in result_cols assert "user_id" in result_cols assert "extra_col" in result_cols - assert "session_id" in result_cols + assert "timestamp_session_id" in result_cols def test_session_encoder_fit_and_transform(df_module): @@ -467,6 +467,26 @@ def test_session_encoder_fit_and_transform(df_module): assert hasattr(se, "all_inputs_") +def test_get_feature_names(df_module): + """Test that get_feature_names returns the correct list of columns.""" + df = df_module.make_dataframe( + { + "timestamp": [ + datetime.datetime(2024, 1, 1, 10, 0), + datetime.datetime(2024, 1, 1, 10, 5), + ], + "user_id": [101, 101], + } + ) + + se = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=30) + se.fit(df) + feature_names = se.get_feature_names_out() + + # Should include original columns plus "session_id" + assert set(feature_names) == {"timestamp", "user_id", "timestamp_session_id"} + + # --------------------------------------------------------------------------- # Tests for the internal dispatched helper functions # --------------------------------------------------------------------------- @@ -520,10 +540,10 @@ def test_add_session_id(df_module): } ) is_new_session = _check_is_new_session(df, [], "timestamp", 30) - result = _add_session_id(df, is_new_session) + result = _add_session_id(df, is_new_session, "timestamp") - assert "session_id" in sbd.column_names(result) - assert sbd.to_list(sbd.col(result, "session_id")) == [0, 0, 1, 1, 2] + assert "timestamp" in sbd.column_names(result) + assert sbd.to_list(sbd.col(result, "timestamp")) == [0, 0, 1, 1, 2] def test_check_is_new_session_no_by(df_module): From 603a57b7dfee69078dd15b356c384c005f6d50a8 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 26 Feb 2026 13:54:55 +0100 Subject: [PATCH 16/74] 
fixing some broken tests --- skrub/_session_encoder.py | 5 ++++- skrub/tests/test_session_encoder.py | 12 +++--------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index d5b793b82..d56fc8eb2 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -87,6 +87,8 @@ def _factorize_column(X, column_name): @_factorize_column.specialize("pandas") def _factorize_column_pandas(X, column_name): + if sbd.is_numeric(X[column_name]): + return X[column_name] codes, _ = pd.factorize(X[column_name]) return codes @@ -95,7 +97,8 @@ def _factorize_column_pandas(X, column_name): def _factorize_column_polars(X, column_name): import polars as pl - # TODO: update this according to the proper polars API + if sbd.is_numeric(X[column_name]): + return X[column_name] return X[column_name].cast(pl.Categorical).to_physical() diff --git a/skrub/tests/test_session_encoder.py b/skrub/tests/test_session_encoder.py index 8fc670ae5..141544444 100644 --- a/skrub/tests/test_session_encoder.py +++ b/skrub/tests/test_session_encoder.py @@ -1,6 +1,5 @@ import datetime -import numpy as np import pytest from skrub import SessionEncoder @@ -483,7 +482,7 @@ def test_get_feature_names(df_module): se.fit(df) feature_names = se.get_feature_names_out() - # Should include original columns plus "session_id" + # Should include original columns plus "timestamp_session_id" assert set(feature_names) == {"timestamp", "user_id", "timestamp_session_id"} @@ -502,20 +501,15 @@ def test_factorize_column_string(df_module): assert codes[1] != codes[0] # "bob" differs from "alice" assert codes[3] != codes[0] # "charlie" differs from "alice" assert codes[1] != codes[3] # "bob" differs from "charlie" - assert all(isinstance(c, np.int64) for c in codes) assert all(int(c) == expected for c, expected in zip(codes, [0, 1, 0, 2])) def test_factorize_column_numeric(df_module): - """_factorize_column on a numeric column should return integer 
codes.""" + """_factorize_column on a numeric column should return the column unchanged.""" df = df_module.make_dataframe({"user_id": [10, 20, 10, 30]}) codes = _factorize_column(df, "user_id") - assert codes[0] == codes[2] # both 10 → same code - assert codes[1] != codes[0] # 20 differs from 10 - assert codes[3] != codes[0] # 30 differs from 10 - assert all(isinstance(c, np.int64) for c in codes) - assert all(int(c) == expected for c, expected in zip(codes, [0, 1, 0, 2])) + df_module.assert_column_equal(codes, df["user_id"]) def test_add_session_id(df_module): From 9d2c577cd5b192fcc3ab90a56bd78b4eaa102c5b Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 26 Feb 2026 14:02:06 +0100 Subject: [PATCH 17/74] doctest --- skrub/_session_encoder.py | 48 +++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index d56fc8eb2..38fa99706 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -190,20 +190,20 @@ class SessionEncoder(TransformerMixin, BaseEstimator): >>> result = encoder.fit_transform(df) >>> result - user_id timestamp action session_id - 0 alice 2024-01-01 10:00:00 login 0 - 1 alice 2024-01-01 10:05:00 view 0 - 2 alice 2024-01-01 11:00:00 logout 1 - 3 bob 2024-01-01 10:00:00 login 2 - 4 bob 2024-01-01 10:20:00 purchase 2 + user_id timestamp action timestamp_session_id + 0 alice 2024-01-01 10:00:00 login 0 + 1 alice 2024-01-01 10:05:00 view 0 + 2 alice 2024-01-01 11:00:00 logout 1 + 3 bob 2024-01-01 10:00:00 login 2 + 4 bob 2024-01-01 10:20:00 purchase 2 In this example: - Alice's first two events (10:00 and 10:05) are 5 minutes apart, so they form - session 1. + session 0. - Alice's third event (11:00) is 55 minutes after the previous one, exceeding - the 30-minute gap, so it forms a new session (session 2). 
- - Bob's events form session 3 (different user), with both events within the + the 30-minute gap, so it forms a new session (session 1). + - Bob's events form session 2 (different user), with both events within the 30-minute window. You can also identify users by multiple columns. For instance, the same user @@ -232,14 +232,14 @@ class SessionEncoder(TransformerMixin, BaseEstimator): ... } >>> df_multi = pd.DataFrame(data_multi) >>> result_multi = encoder_multi.fit_transform(df_multi) - >>> result_multi[['user_id', 'device_id', 'timestamp', 'action', 'session_id']] - user_id device_id timestamp action session_id - 0 1 desktop 2024-01-01 10:05:00 view 0 - 1 1 desktop 2024-01-01 10:20:00 checkout 0 - 2 1 mobile 2024-01-01 10:00:00 view 1 - 3 1 mobile 2024-01-01 10:10:00 purchase 1 - 4 2 mobile 2024-01-01 10:00:00 login 2 - 5 2 mobile 2024-01-01 10:15:00 view 2 + >>> result_multi + user_id device_id timestamp action timestamp_session_id + 0 1 desktop 2024-01-01 10:05:00 view 0 + 1 1 desktop 2024-01-01 10:20:00 checkout 0 + 2 1 mobile 2024-01-01 10:00:00 view 1 + 3 1 mobile 2024-01-01 10:10:00 purchase 1 + 4 2 mobile 2024-01-01 10:00:00 login 2 + 5 2 mobile 2024-01-01 10:15:00 view 2 In this example: @@ -270,13 +270,13 @@ class SessionEncoder(TransformerMixin, BaseEstimator): ... 
} >>> df_no_by = pd.DataFrame(data_no_by) >>> result_no_by = encoder_no_by.fit_transform(df_no_by) - >>> result_no_by[['timestamp', 'event_type', 'session_id']] - timestamp event_type session_id - 0 2024-01-01 10:00:00 start 0 - 1 2024-01-01 10:10:00 action 0 - 2 2024-01-01 10:15:00 action 0 - 3 2024-01-01 11:00:00 restart 1 - 4 2024-01-01 11:10:00 action 1 + >>> result_no_by + timestamp event_type timestamp_session_id + 0 2024-01-01 10:00:00 start 0 + 1 2024-01-01 10:10:00 action 0 + 2 2024-01-01 10:15:00 action 0 + 3 2024-01-01 11:00:00 restart 1 + 4 2024-01-01 11:10:00 action 1 In this example: From 8d869b22925c81e3cf4cc221f001ee513581cfd3 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 26 Feb 2026 14:29:51 +0100 Subject: [PATCH 18/74] testing error dispatch --- skrub/tests/test_session_encoder.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/skrub/tests/test_session_encoder.py b/skrub/tests/test_session_encoder.py index 141544444..91a3cf249 100644 --- a/skrub/tests/test_session_encoder.py +++ b/skrub/tests/test_session_encoder.py @@ -1,5 +1,7 @@ import datetime +from functools import partial +import numpy as np import pytest from skrub import SessionEncoder @@ -581,3 +583,16 @@ def test_check_is_new_session_with_by(df_module): assert not is_new[1] # same user, small gap assert is_new[2] # user changed → new session assert not is_new[3] # same user, small gap + + +@pytest.mark.parametrize( + "func", + ( + partial(_check_is_new_session, by=None, timestamp="timestamp", session_gap=30), + partial(_factorize_column, column_name="user_id"), + partial(_add_session_id, is_new_session=None, column_name="timestamp"), + ), +) +def test_error_dispatch(func): + with pytest.raises(TypeError, match="Expecting a Pandas or Polars Dataframe"): + func(np.array([1])) From 01fed5fc2bef9d139a0fb5a607d7f3c8ec530aa0 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Fri, 27 Feb 2026 11:05:25 +0100 Subject: [PATCH 19/74] addressing some of the 
coments --- skrub/_session_encoder.py | 162 +++++++++++++++++++------------------- 1 file changed, 81 insertions(+), 81 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 38fa99706..881880e97 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -11,6 +11,7 @@ """ import numbers +from collections.abc import Iterable import pandas as pd from packaging.version import parse @@ -18,7 +19,7 @@ from sklearn.utils.validation import check_is_fitted from . import _dataframe as sbd -from ._dispatch import dispatch +from ._dispatch import dispatch, raise_dispatch_unregistered_type from ._utils import random_string try: @@ -28,15 +29,12 @@ @dispatch -def _check_is_new_session(X, by, timestamp, session_gap): - # Avoid circular import - from ._dispatch import raise_dispatch_unregistered_type - +def _check_is_new_session(X, group_by, timestamp, session_gap): raise_dispatch_unregistered_type(X, kind="Dataframe") @_check_is_new_session.specialize("pandas") -def _check_is_new_session_pandas(X, by, timestamp, session_gap): +def _check_is_new_session_pandas(X, group_by, timestamp, session_gap): # pandas 3.0 changed the resolution of astype(int) for datetime columns from # nanoseconds to milliseconds, so we need to adjust the time difference calculation # accordingly @@ -49,39 +47,54 @@ def _check_is_new_session_pandas(X, by, timestamp, session_gap): time_diff = ( X[timestamp].astype(int).diff().fillna(0) // 10**3 > session_gap * 60 * 1000 ) - if not by: + if not group_by: return time_diff - # check if the "by" column changes - char_diff = (X[by].diff().fillna(0) > 0).any(axis=1) - # a new session starts if either the "by" column changes or the time gap is + # check if the "group_by" column changes + char_diff = (X[group_by].diff().fillna(0) > 0).any(axis=1) + # a new session starts if either the "group_by" column changes or the time gap is # exceeded is_new_session = char_diff | time_diff return is_new_session 
@_check_is_new_session.specialize("polars") -def _check_is_new_session_polars(X, by, timestamp, session_gap): +def _check_is_new_session_polars(X, group_by, timestamp, session_gap): # check if the time difference between events exceeds the session gap time_diff = ( X[timestamp].dt.epoch("ms").diff().fill_null(0) > session_gap * 60 * 1000 ) - if not by: + if not group_by: return time_diff - # check if the "by" column changes + # check if the "group_by" column changes char_diff = X.select( - pl.any_horizontal(pl.col(by).diff().fill_null(0) > 0) + pl.any_horizontal(pl.col(group_by).diff().fill_null(0) > 0) ).to_series() - # a new session starts if either the "by" column changes or the time gap is + # a new session starts if either the "group_by" column changes or the time gap is # exceeded is_new_session = char_diff | time_diff return is_new_session @dispatch -def _factorize_column(X, column_name): - # Avoid circular import - from ._dispatch import raise_dispatch_unregistered_type +def _add_session_id(X, is_new_session, column_name): + raise_dispatch_unregistered_type(X, kind="Dataframe") + + +@_add_session_id.specialize("pandas") +def _add_session_id_pandas(X, is_new_session, column_name): + # Compute cumulative sum of is_new_session to create session IDs + X[column_name] = is_new_session.cumsum() + return X + +@_add_session_id.specialize("polars") +def _add_session_id_polars(X, is_new_session, column_name): + # Add session_id by computing cumulative sum of is_new_session + return X.with_columns(is_new_session.cum_sum().alias(column_name)) + + +@dispatch +def _factorize_column(X, column_name): raise_dispatch_unregistered_type(X, kind="Dataframe") @@ -102,47 +115,27 @@ def _factorize_column_polars(X, column_name): return X[column_name].cast(pl.Categorical).to_physical() -@dispatch -def _add_session_id(X, is_new_session, column_name): - # Avoid circular import - from ._dispatch import raise_dispatch_unregistered_type - - raise_dispatch_unregistered_type(X, 
kind="Dataframe") - - -@_add_session_id.specialize("pandas") -def _add_session_id_pandas(X, is_new_session, column_name): - # Compute cumulative sum of is_new_session to create session IDs - X[column_name] = is_new_session.cumsum() - return X - - -@_add_session_id.specialize("polars") -def _add_session_id_polars(X, is_new_session, column_name): - # Add session_id by computing cumulative sum of is_new_session - return X.with_columns(is_new_session.cum_sum().alias(column_name)) - - class SessionEncoder(TransformerMixin, BaseEstimator): """Encode sessions from a dataframe. A session is defined as a sequence of events where consecutive events are separated by at most ``session_gap`` minutes. Additionally, it is possible to provide a column - or list of columns that identifies the user (specified by the ``by`` column). + or list of columns that identifies the user (specified by the ``group_by`` column). When the time gap between consecutive events exceeds ``session_gap``, or when the user changes, a new session begins. All unrelated columns are passed through unchanged. Parameters ---------- - timestamp : str + timestamp_col : str The name of the column that identifies the time of an event. This column is used to determine the start and end of a session. - by : optional[str, list[str]], default=None - The name of the column, or list of columns, that identifies a user. This - parameter is used to group events into sessions. If not provided, all - events are considered to belong to the same user. + group_by : optional[str, list[str]], default=None + The name of the column, or list of columns, to group by. This parameter + is used to group events into sessions by, for example, user. If not + provided, sessions are detected based on the time gap between events, and all + events are considered to belong to the same user (or group). session_gap : int, default=30 The maximum gap (in minutes) between events in a session. 
If the gap @@ -165,7 +158,9 @@ class SessionEncoder(TransformerMixin, BaseEstimator): >>> import pandas as pd >>> from datetime import datetime, timedelta - >>> encoder = SessionEncoder(by='user_id', timestamp='timestamp', session_gap=30) + >>> encoder = SessionEncoder( + ... group_by='user_id', timestamp_col='timestamp', session_gap=30 + ... ) >>> # Create a sample dataframe with events from different users >>> data = { @@ -193,7 +188,7 @@ class SessionEncoder(TransformerMixin, BaseEstimator): user_id timestamp action timestamp_session_id 0 alice 2024-01-01 10:00:00 login 0 1 alice 2024-01-01 10:05:00 view 0 - 2 alice 2024-01-01 11:00:00 logout 1 + 2 alice 2024-01-01 11:00:00 purchase 1 3 bob 2024-01-01 10:00:00 login 2 4 bob 2024-01-01 10:20:00 purchase 2 @@ -210,8 +205,8 @@ class SessionEncoder(TransformerMixin, BaseEstimator): on different devices should have separate sessions: >>> encoder_multi = SessionEncoder( - ... by=['user_id', 'device_id'], - ... timestamp='timestamp', + ... group_by=['user_id', 'device_id'], + ... timestamp_col='timestamp', ... session_gap=30 ... ) @@ -251,14 +246,14 @@ class SessionEncoder(TransformerMixin, BaseEstimator): sessions are separated only by time gaps. This is useful for analyzing a single timeseries or events that don't have a user dimension: - >>> encoder_no_by = SessionEncoder( - ... by=None, - ... timestamp='timestamp', + >>> encoder_no_group = SessionEncoder( + ... group_by=None, + ... timestamp_col='timestamp', ... session_gap=30 ... ) >>> # Create a sample dataframe with only timestamps - >>> data_no_by = { + >>> data_no_group = { ... 'timestamp': [ ... pd.Timestamp('2024-01-01 10:00:00'), ... pd.Timestamp('2024-01-01 10:10:00'), # 10 min gap @@ -268,9 +263,9 @@ class SessionEncoder(TransformerMixin, BaseEstimator): ... ], ... 'event_type': ['start', 'action', 'action', 'restart', 'action'] ... 
} - >>> df_no_by = pd.DataFrame(data_no_by) - >>> result_no_by = encoder_no_by.fit_transform(df_no_by) - >>> result_no_by + >>> df_no_group = pd.DataFrame(data_no_group) + >>> result_no_group = encoder_no_group.fit_transform(df_no_group) + >>> result_no_group timestamp event_type timestamp_session_id 0 2024-01-01 10:00:00 start 0 1 2024-01-01 10:10:00 action 0 @@ -287,9 +282,9 @@ class SessionEncoder(TransformerMixin, BaseEstimator): """ - def __init__(self, timestamp, by=None, session_gap=30): - self.timestamp = timestamp - self.by = by + def __init__(self, timestamp_col, group_by=None, session_gap=30): + self.timestamp_col = timestamp_col + self.group_by = group_by self.session_gap = session_gap def fit(self, X, y=None): @@ -329,46 +324,51 @@ def fit_transform(self, X, y=None): """ self.all_inputs_ = sbd.column_names(X) # check that the required columns are present in the input dataframe - if self.by is not None: - if isinstance(self.by, str): - self.by_columns = [self.by] - elif isinstance(self.by, list): - self.by_columns = self.by + if self.group_by is not None: + if isinstance(self.group_by, str): + self.group_by_columns = [self.group_by] + elif isinstance(self.group_by, Iterable) and not isinstance( + self.group_by, str + ): + self.group_by_columns = list(self.group_by) else: - raise TypeError("by must be a string, a list of strings, or None") - if self.by is not None: - for col in self.by_columns: + raise TypeError("group_by must be a string, a list of strings, or None") + if self.group_by is not None: + for col in self.group_by_columns: if col not in self.all_inputs_: raise ValueError(f"Column '{col}' not found in input dataframe") - if self.timestamp not in self.all_inputs_: - raise ValueError(f"Column '{self.timestamp}' not found in input dataframe") + if self.timestamp_col not in self.all_inputs_: + raise ValueError( + f"Column '{self.timestamp_col}' not found in input dataframe" + ) # check the correctness of the values of session_gap if not 
isinstance(self.session_gap, numbers.Number) or self.session_gap <= 0: raise ValueError("session_gap must be a positive number") - # sort the input dataframe by the "by" and "timestamp" columns + # sort the input dataframe by the "group_by" and "timestamp" columns sort_by = ( - self.by_columns + [self.timestamp] - if self.by is not None - else [self.timestamp] + self.group_by_columns + [self.timestamp_col] + if self.group_by is not None + else [self.timestamp_col] ) X_sorted = sbd.sort(X, by=sort_by) X_factorized, factorized_by = self._factorize_columns(X_sorted) # mark the start of a new session by checking the difference is_new_session = _check_is_new_session( - X_factorized, factorized_by, self.timestamp, self.session_gap + X_factorized, factorized_by, self.timestamp_col, self.session_gap ) # add the session id - session_col_name = f"{self.timestamp}_session_id" + session_col_name = f"{self.timestamp_col}_session_id" X_with_session_id = _add_session_id( X_factorized, is_new_session, session_col_name ) - # drop the factorized "by" column if the original "by" column was not numeric - to_drop = [col for col in factorized_by if col not in self.by_columns] + # drop the factorized "group_by" column if the original "group_by" + # column was not numeric + to_drop = [col for col in factorized_by if col not in self.group_by_columns] X_with_session_id = sbd.drop_columns(X_with_session_id, to_drop) self.all_outputs_ = sbd.column_names(X_with_session_id) @@ -387,19 +387,19 @@ def transform(self, X): pandas.DataFrame or polars.DataFrame The transformed dataframe with session information. 
""" - check_is_fitted() + check_is_fitted(self) return self.fit_transform(X) def _factorize_columns(self, X): - # convert by column to string if it's not already, to ensure + # convert group_by column to string if it's not already, to ensure # that the diff operation works correctly - if not self.by: + if not self.group_by: return X, [] factorized_columns = { f"{col}_factorized_skrub_{random_string()}": _factorize_column(X, col) if not sbd.is_numeric(X[col]) else X[col] - for col in self.by_columns + for col in self.group_by_columns } X_factorized = sbd.with_columns(X, **factorized_columns) From 746109b6b0429bc67b4f216d2875ae1c5bdb8131 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Fri, 27 Feb 2026 11:50:29 +0100 Subject: [PATCH 20/74] addressing more comments --- skrub/_session_encoder.py | 83 ++++++++----------- skrub/tests/test_session_encoder.py | 124 +++++++++++++--------------- 2 files changed, 93 insertions(+), 114 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 881880e97..a46ef5035 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -29,67 +29,58 @@ @dispatch -def _check_is_new_session(X, group_by, timestamp, session_gap): +def _add_session_id(X, group_by, timestamp_col, session_gap): raise_dispatch_unregistered_type(X, kind="Dataframe") -@_check_is_new_session.specialize("pandas") -def _check_is_new_session_pandas(X, group_by, timestamp, session_gap): +@_add_session_id.specialize("pandas") +def _add_session_id_pandas(X, group_by, timestamp_col, session_gap): # pandas 3.0 changed the resolution of astype(int) for datetime columns from # nanoseconds to milliseconds, so we need to adjust the time difference calculation # accordingly if parse(pd.__version__).major <= 2: # check if the time difference between events exceeds the session gap time_diff = ( - X[timestamp].astype(int).diff().fillna(0) // 10**6 > session_gap * 60 * 1000 + X[timestamp_col].astype(int).diff().fillna(0) // 10**6 + > 
session_gap * 60 * 1000 ) else: time_diff = ( - X[timestamp].astype(int).diff().fillna(0) // 10**3 > session_gap * 60 * 1000 + X[timestamp_col].astype(int).diff().fillna(0) // 10**3 + > session_gap * 60 * 1000 ) - if not group_by: - return time_diff - # check if the "group_by" column changes - char_diff = (X[group_by].diff().fillna(0) > 0).any(axis=1) - # a new session starts if either the "group_by" column changes or the time gap is - # exceeded - is_new_session = char_diff | time_diff - return is_new_session + if group_by: + # check if the "group_by" column changes + group_diff = (X[group_by].diff().fillna(0) != 0).any(axis=1) + # a new session starts if either the "group_by" column changes or the time + # gap is exceeded + is_new_session = group_diff | time_diff + else: + is_new_session = time_diff + # Compute cumulative sum of is_new_session to create session IDs + column_name = f"{timestamp_col}_session_id" + X[column_name] = is_new_session.cumsum() + return X -@_check_is_new_session.specialize("polars") -def _check_is_new_session_polars(X, group_by, timestamp, session_gap): +@_add_session_id.specialize("polars") +def _add_session_id_polars(X, group_by, timestamp_col, session_gap): # check if the time difference between events exceeds the session gap time_diff = ( - X[timestamp].dt.epoch("ms").diff().fill_null(0) > session_gap * 60 * 1000 + X[timestamp_col].dt.epoch("ms").diff().fill_null(0) > session_gap * 60 * 1000 ) - if not group_by: - return time_diff - # check if the "group_by" column changes - char_diff = X.select( - pl.any_horizontal(pl.col(group_by).diff().fill_null(0) > 0) - ).to_series() + if group_by: + # check if the "group_by" column changes + group_diff = X.select( + pl.any_horizontal(pl.col(group_by).diff().fill_null(0) != 0) + ).to_series() + is_new_session = group_diff | time_diff + else: + is_new_session = time_diff # a new session starts if either the "group_by" column changes or the time gap is # exceeded - is_new_session = char_diff | 
time_diff - return is_new_session - - -@dispatch -def _add_session_id(X, is_new_session, column_name): - raise_dispatch_unregistered_type(X, kind="Dataframe") - - -@_add_session_id.specialize("pandas") -def _add_session_id_pandas(X, is_new_session, column_name): - # Compute cumulative sum of is_new_session to create session IDs - X[column_name] = is_new_session.cumsum() - return X - - -@_add_session_id.specialize("polars") -def _add_session_id_polars(X, is_new_session, column_name): # Add session_id by computing cumulative sum of is_new_session + column_name = f"{timestamp_col}_session_id" return X.with_columns(is_new_session.cum_sum().alias(column_name)) @@ -172,14 +163,14 @@ class SessionEncoder(TransformerMixin, BaseEstimator): ... pd.Timestamp('2024-01-01 10:00:00'), # Different user ... pd.Timestamp('2024-01-01 10:20:00'), # 20 min later, same session ... ], - ... 'action': ['login', 'view', 'logout', 'login', 'purchase'] + ... 'action': ['login', 'view', 'purchase', 'login', 'purchase'] ... 
} >>> df = pd.DataFrame(data) >>> df user_id timestamp action 0 alice 2024-01-01 10:00:00 login 1 alice 2024-01-01 10:05:00 view - 2 alice 2024-01-01 11:00:00 logout + 2 alice 2024-01-01 11:00:00 purchase 3 bob 2024-01-01 10:00:00 login 4 bob 2024-01-01 10:20:00 purchase @@ -356,16 +347,10 @@ def fit_transform(self, X, y=None): X_sorted = sbd.sort(X, by=sort_by) X_factorized, factorized_by = self._factorize_columns(X_sorted) - # mark the start of a new session by checking the difference - is_new_session = _check_is_new_session( - X_factorized, factorized_by, self.timestamp_col, self.session_gap - ) # add the session id - session_col_name = f"{self.timestamp_col}_session_id" X_with_session_id = _add_session_id( - X_factorized, is_new_session, session_col_name + X_factorized, factorized_by, self.timestamp_col, self.session_gap ) - # drop the factorized "group_by" column if the original "group_by" # column was not numeric to_drop = [col for col in factorized_by if col not in self.group_by_columns] diff --git a/skrub/tests/test_session_encoder.py b/skrub/tests/test_session_encoder.py index 91a3cf249..d4e08d403 100644 --- a/skrub/tests/test_session_encoder.py +++ b/skrub/tests/test_session_encoder.py @@ -8,7 +8,6 @@ from skrub import _dataframe as sbd from skrub._session_encoder import ( _add_session_id, - _check_is_new_session, _factorize_column, ) @@ -60,7 +59,7 @@ def example_session_data_multi_by(df_module): A user is uniquely identified by the combination of ``user_id`` and ``device_id``. The same ``user_id`` on two different devices produces - independent sessions, which lets us verify that ``by`` accepts a list of + independent sessions, which lets us verify that ``group_by`` accepts a list of column names. 
""" timestamps = [] @@ -113,7 +112,7 @@ def test_session_encoder_basic( ): """Test basic sessionization grouping by user_id or username.""" # Apply SessionEncoder grouping by the specified column - se = SessionEncoder(by=by_column, timestamp="timestamp", session_gap=30) + se = SessionEncoder(group_by=by_column, timestamp_col="timestamp", session_gap=30) result = se.fit_transform(example_session_data) # Check that we have the expected total number of sessions @@ -147,7 +146,7 @@ def test_session_encoder_different_users_different_sessions( ): """Test that different users/groups have different session IDs.""" # Apply SessionEncoder - se = SessionEncoder(by=by_column, timestamp="timestamp", session_gap=30) + se = SessionEncoder(group_by=by_column, timestamp_col="timestamp", session_gap=30) result = se.fit_transform(example_session_data) # content of the "session_id" column after sessionization @@ -173,7 +172,7 @@ def test_session_encoder_multi_by_columns(example_session_data_multi_by): """Test sessionization when a user is identified by a combination of columns. The fixture has user_id=1 on two devices ("mobile" and "desktop"). When - ``by=["user_id", "device_id"]``, those two device contexts must be treated + ``group_by=["user_id", "device_id"]``, those two device contexts must be treated as independent groups, producing separate session IDs even though they share the same ``user_id``. 
@@ -184,7 +183,7 @@ def test_session_encoder_multi_by_columns(example_session_data_multi_by): Total: 4 sessions """ se = SessionEncoder( - by=["user_id", "device_id"], timestamp="timestamp", session_gap=30 + group_by=["user_id", "device_id"], timestamp_col="timestamp", session_gap=30 ) result = se.fit_transform(example_session_data_multi_by) @@ -236,7 +235,7 @@ def test_session_encoder_multiple_users(df_module): } ) - se = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=30) + se = SessionEncoder(group_by="user_id", timestamp_col="timestamp", session_gap=30) result = se.fit_transform(df) # After sorting by user_id and timestamp, each user should have 1 session @@ -266,13 +265,17 @@ def test_session_encoder_time_gap_threshold(df_module): ) # With 20-minute gap: should create 2 sessions (split at 35-min gap) - se_20 = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=20) + se_20 = SessionEncoder( + group_by="user_id", timestamp_col="timestamp", session_gap=20 + ) result_20 = se_20.fit_transform(df) session_ids_20 = sbd.to_list(sbd.col(result_20, "timestamp_session_id")) assert len(set(session_ids_20)) == 2 # With 40-minute gap: should create 1 session (all gaps < 40 min) - se_40 = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=40) + se_40 = SessionEncoder( + group_by="user_id", timestamp_col="timestamp", session_gap=40 + ) result_40 = se_40.fit_transform(df) session_ids_40 = sbd.to_list(sbd.col(result_40, "timestamp_session_id")) assert len(set(session_ids_40)) == 1 @@ -281,7 +284,7 @@ def test_session_encoder_time_gap_threshold(df_module): def test_session_encoder_no_user_column(df_module): """Test sessionization without a user identifier column. - When ``by`` is None, all events are treated as from the same "user", and + When ``group_by`` is None, all events are treated as from the same "user", and sessions are separated only by time gaps. 
""" timestamps = [ @@ -298,8 +301,8 @@ def test_session_encoder_no_user_column(df_module): } ) - # Without 'by', sessions are separated only by time gaps - se = SessionEncoder(by=None, timestamp="timestamp", session_gap=30) + # Without 'group_by', sessions are separated only by time gaps + se = SessionEncoder(group_by=None, timestamp_col="timestamp", session_gap=30) result = se.fit_transform(df) session_ids = sbd.to_list(sbd.col(result, "timestamp_session_id")) @@ -322,7 +325,7 @@ def test_session_encoder_single_event(df_module): } ) - se = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=30) + se = SessionEncoder(group_by="user_id", timestamp_col="timestamp", session_gap=30) result = se.fit_transform(df) session_ids = sbd.to_list(sbd.col(result, "timestamp_session_id")) @@ -340,7 +343,7 @@ def test_session_encoder_empty_dataframe(df_module): } ) - se = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=30) + se = SessionEncoder(group_by="user_id", timestamp_col="timestamp", session_gap=30) result = se.fit_transform(df) assert sbd.shape(result)[0] == 0 @@ -348,7 +351,7 @@ def test_session_encoder_empty_dataframe(df_module): @pytest.mark.parametrize( - "by_param,timestamp_param,expected_error_type,expected_error_match", + "group_by_param,timestamp_col_param,expected_error_type,expected_error_match", [ ( "wrong_column", @@ -369,15 +372,19 @@ def test_session_encoder_empty_dataframe(df_module): "Column 'wrong_column' not found", ), ( - 23, # invalid type for 'by' + 23, # invalid type for 'group_by' "timestamp", TypeError, - "by must be a string, a list of strings, or None", + "group_by must be a string, a list of strings, or None", ), ], ) def test_session_encoder_missing_column_error( - df_module, by_param, timestamp_param, expected_error_type, expected_error_match + df_module, + group_by_param, + timestamp_col_param, + expected_error_type, + expected_error_match, ): """Test that missing columns and invalid parameters raise appropriate 
errors.""" df = df_module.make_dataframe( @@ -389,8 +396,8 @@ def test_session_encoder_missing_column_error( ) se = SessionEncoder( - by=by_param, - timestamp=timestamp_param, + group_by=group_by_param, + timestamp_col=timestamp_col_param, ) with pytest.raises(expected_error_type, match=expected_error_match): se.fit_transform(df) @@ -406,18 +413,22 @@ def test_session_encoder_invalid_parameters(df_module): ) # Test negative session_gap - se_negative = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=-10) + se_negative = SessionEncoder( + group_by="user_id", timestamp_col="timestamp", session_gap=-10 + ) with pytest.raises(ValueError, match="session_gap must be a positive number"): se_negative.fit_transform(df) # Test zero session_gap - se_zero = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=0) + se_zero = SessionEncoder( + group_by="user_id", timestamp_col="timestamp", session_gap=0 + ) with pytest.raises(ValueError, match="session_gap must be a positive number"): se_zero.fit_transform(df) # Test non-numeric session_gap se_non_numeric = SessionEncoder( - by="user_id", timestamp="timestamp", session_gap="thirty" + group_by="user_id", timestamp_col="timestamp", session_gap="thirty" ) with pytest.raises(ValueError, match="session_gap must be a positive number"): se_non_numeric.fit_transform(df) @@ -436,7 +447,7 @@ def test_session_encoder_preserves_columns(df_module): } ) - se = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=30) + se = SessionEncoder(group_by="user_id", timestamp_col="timestamp", session_gap=30) result = se.fit_transform(df) result_cols = sbd.column_names(result) @@ -458,7 +469,7 @@ def test_session_encoder_fit_and_transform(df_module): } ) - se = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=30) + se = SessionEncoder(group_by="user_id", timestamp_col="timestamp", session_gap=30) # Test fit returns self se_fitted = se.fit(df) @@ -480,7 +491,7 @@ def test_get_feature_names(df_module): 
} ) - se = SessionEncoder(by="user_id", timestamp="timestamp", session_gap=30) + se = SessionEncoder(group_by="user_id", timestamp_col="timestamp", session_gap=30) se.fit(df) feature_names = se.get_feature_names_out() @@ -514,36 +525,8 @@ def test_factorize_column_numeric(df_module): df_module.assert_column_equal(codes, df["user_id"]) -def test_add_session_id(df_module): - """_add_session_id should add a 'session_id' column computed as a cumulative - sum of the boolean ``is_new_session`` series. - - We obtain ``is_new_session`` from ``_check_is_new_session`` (no by-group) so - that the boolean series has the correct type for each dataframe backend. - """ - # Gaps: -, 5 min, 55 min, 10 min, 50 min - # is_new_session: [False, False, True, False, True] - # cumsum: [ 0, 0, 1, 1, 2 ] - df = df_module.make_dataframe( - { - "timestamp": [ - datetime.datetime(2024, 1, 1, 10, 0), - datetime.datetime(2024, 1, 1, 10, 5), # 5 min – same session - datetime.datetime(2024, 1, 1, 11, 0), # 55 min – new session - datetime.datetime(2024, 1, 1, 11, 10), # 10 min – same session - datetime.datetime(2024, 1, 1, 12, 0), # 50 min – new session - ] - } - ) - is_new_session = _check_is_new_session(df, [], "timestamp", 30) - result = _add_session_id(df, is_new_session, "timestamp") - - assert "timestamp" in sbd.column_names(result) - assert sbd.to_list(sbd.col(result, "timestamp")) == [0, 0, 1, 1, 2] - - def test_check_is_new_session_no_by(df_module): - """_check_is_new_session with an empty by-list uses only the time gap.""" + """_check_is_new_session with an empty group_by-list uses only the time gap.""" df = df_module.make_dataframe( { "timestamp": [ @@ -554,7 +537,7 @@ def test_check_is_new_session_no_by(df_module): ] } ) - is_new = sbd.to_list(_check_is_new_session(df, [], "timestamp", 30)) + is_new = sbd.to_list(_add_session_id(df, [], "timestamp", 30)) # First row is never a new session (no previous row), all others depend on gap assert not is_new[0] # (first row) @@ -564,8 +547,18 @@ 
def test_check_is_new_session_no_by(df_module): def test_check_is_new_session_with_by(df_module): - """_check_is_new_session detects a new session when the group key changes, - even if the time gap is small.""" + """_add_session_id returns a dataframe with a ``timestamp_session_id`` + column when a group_by-list is provided. A new session starts when the group key + changes (even for a tiny time gap) or when the time gap exceeds + ``session_gap``. + + Data layout (already sorted by user_id, timestamp): + row 0: user 1, 10:00 – first row, session 0 + row 1: user 1, 10:05 – same user, 5 min gap → still session 0 + row 2: user 2, 10:06 – user changed, 1 min gap → new session 1 + row 3: user 2, 10:10 – same user, 4 min gap → still session 1 + Expected session_ids: [0, 0, 1, 1] + """ df = df_module.make_dataframe( { "user_id": [1, 1, 2, 2], @@ -577,20 +570,21 @@ def test_check_is_new_session_with_by(df_module): ], } ) - is_new = sbd.to_list(_check_is_new_session(df, ["user_id"], "timestamp", 30)) + result = _add_session_id(df, ["user_id"], "timestamp", 30) - assert not is_new[0] # first row - assert not is_new[1] # same user, small gap - assert is_new[2] # user changed → new session - assert not is_new[3] # same user, small gap + # _add_session_id now returns the full dataframe with session_id added + assert "timestamp_session_id" in sbd.column_names(result) + session_ids = sbd.to_list(sbd.col(result, "timestamp_session_id")) + assert session_ids == [0, 0, 1, 1] @pytest.mark.parametrize( "func", ( - partial(_check_is_new_session, by=None, timestamp="timestamp", session_gap=30), + partial( + _add_session_id, group_by=[], timestamp_col="timestamp", session_gap=30 + ), partial(_factorize_column, column_name="user_id"), - partial(_add_session_id, is_new_session=None, column_name="timestamp"), ), ) def test_error_dispatch(func): From 71302b15bd675fc6e0e6a7b5d912d591fef1dd61 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Fri, 27 Feb 2026 12:10:19 +0100 Subject: [PATCH 
21/74] fixing test --- skrub/_session_encoder.py | 4 ++-- skrub/tests/test_session_encoder.py | 12 +++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index a46ef5035..778ad594b 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -74,11 +74,11 @@ def _add_session_id_polars(X, group_by, timestamp_col, session_gap): group_diff = X.select( pl.any_horizontal(pl.col(group_by).diff().fill_null(0) != 0) ).to_series() + # a new session starts if either the "group_by" column changes or the time + # gap is exceeded is_new_session = group_diff | time_diff else: is_new_session = time_diff - # a new session starts if either the "group_by" column changes or the time gap is - # exceeded # Add session_id by computing cumulative sum of is_new_session column_name = f"{timestamp_col}_session_id" return X.with_columns(is_new_session.cum_sum().alias(column_name)) diff --git a/skrub/tests/test_session_encoder.py b/skrub/tests/test_session_encoder.py index d4e08d403..e4852a5f3 100644 --- a/skrub/tests/test_session_encoder.py +++ b/skrub/tests/test_session_encoder.py @@ -537,13 +537,11 @@ def test_check_is_new_session_no_by(df_module): ] } ) - is_new = sbd.to_list(_add_session_id(df, [], "timestamp", 30)) - - # First row is never a new session (no previous row), all others depend on gap - assert not is_new[0] # (first row) - assert not is_new[1] # 10 min < 30 min - assert is_new[2] # 50 min > 30 min - assert not is_new[3] # 5 min < 30 min + session_id = sbd.to_list( + sbd.col(_add_session_id(df, [], "timestamp", 30), "timestamp_session_id") + ) + # Expected: first two events in session 0, last two events in session 1 + assert session_id == [0, 0, 1, 1] def test_check_is_new_session_with_by(df_module): From 868d52956e8932f970d9b69f9e0f3cb226e8b129 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Fri, 27 Mar 2026 10:53:18 +0100 Subject: [PATCH 22/74] changelog --- CHANGES.rst | 13
+++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index ff09e9076..ba5f8aa25 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -11,6 +11,13 @@ Ongoing Development New Features ------------ +- The :class:`SessionEncoder` is now available. This encoder takes a dataframe with + a timestamp column and computes sessions based on the given session duration. + Additionally, it is possible to provide a ``by`` column or list of columns + (e.g., user ID or (user ID, user device)) to compute sessions for each grouping + value. A new dataset has also been added and can be accessed by using the fetcher + :class:`~skrub.datasets.fetch_wowah`. + :pr:`1930` by :user:`Riccardo Cappuzzo `. Changes ------- @@ -39,12 +46,6 @@ New Features some more attributes for inspection by scikit-learn: ``__sklearn_tags__``, ``classes_``, ``_estimator_type``. :pr:`1931` by :user:`Jérôme Dockès `. -- The :class:`SessionEncoder` is now available. This encoder takes a dataframe with - a timestamp column and computes sessions based on the given session duration. - Additionally, it is possible to provide a ``by`` column or list of columns - (e.g., user ID or (user ID, user device)) to compute sessions for each grouping - value. - :pr:`1930` by :user:`Riccardo Cappuzzo `. - It is now possible to pass additional (dynamically computed) arguments to the cross-validation splitter used by :class:`DataOp` objects for validation, hyperparameter search etc. 
For example, the groups for a From bd64559a156919f66e17aa767426cab74b763b5b Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Wed, 22 Apr 2026 15:50:40 +0200 Subject: [PATCH 23/74] reordering rows after adding session id --- skrub/_session_encoder.py | 10 ++++++++-- skrub/tests/test_session_encoder.py | 26 ++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 778ad594b..95c00129a 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -338,6 +338,9 @@ def fit_transform(self, X, y=None): if not isinstance(self.session_gap, numbers.Number) or self.session_gap <= 0: raise ValueError("session_gap must be a positive number") + row_order_col = f"_row_order_skrub_{random_string()}" + X = sbd.with_columns(X, **{row_order_col: range(X.shape[0])}) + # sort the input dataframe by the "group_by" and "timestamp" columns sort_by = ( self.group_by_columns + [self.timestamp_col] @@ -356,8 +359,11 @@ def fit_transform(self, X, y=None): to_drop = [col for col in factorized_by if col not in self.group_by_columns] X_with_session_id = sbd.drop_columns(X_with_session_id, to_drop) - self.all_outputs_ = sbd.column_names(X_with_session_id) - return X_with_session_id + X_result = sbd.sort(X_with_session_id, by=row_order_col) + X_result = sbd.drop_columns(X_result, row_order_col) + + self.all_outputs_ = sbd.column_names(X_result) + return X_result def transform(self, X): """Transform the data by encoding sessions. diff --git a/skrub/tests/test_session_encoder.py b/skrub/tests/test_session_encoder.py index e4852a5f3..a0ac94d64 100644 --- a/skrub/tests/test_session_encoder.py +++ b/skrub/tests/test_session_encoder.py @@ -576,6 +576,32 @@ def test_check_is_new_session_with_by(df_module): assert session_ids == [0, 0, 1, 1] +def test_session_encoder_preserves_input_order(df_module): + """Test that the output rows are in the same order as the input rows. 
+ + The encoder sorts internally to detect sessions correctly, but the result + must be returned in the original input order. + """ + # Deliberately unsorted: bob first, then alice, timestamps out of order + timestamps = [ + datetime.datetime(2024, 1, 1, 10, 20), # row 0: bob + datetime.datetime(2024, 1, 1, 10, 0), # row 1: alice + datetime.datetime(2024, 1, 1, 10, 25), # row 2: bob + datetime.datetime(2024, 1, 1, 10, 5), # row 3: alice + ] + user_ids = ["bob", "alice", "bob", "alice"] + + df = df_module.make_dataframe({"timestamp": timestamps, "user_id": user_ids}) + + se = SessionEncoder(group_by="user_id", timestamp_col="timestamp", session_gap=30) + result = se.fit_transform(df) + + # The user_id column must still be in the original order + assert sbd.to_list(sbd.col(result, "user_id")) == user_ids + # The timestamp column must still be in the original order + assert sbd.to_list(sbd.col(result, "timestamp")) == timestamps + + @pytest.mark.parametrize( "func", ( From 8ddd65139d2fb3c1225a3cd35b102728e175eeb9 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Fri, 22 May 2026 16:27:45 +0200 Subject: [PATCH 24/74] fixing changelog after merge --- CHANGES.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 831f39802..6e8ef8410 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -11,7 +11,13 @@ Ongoing development New Features ------------ - +- The :class:`SessionEncoder` is now available. This encoder takes a dataframe with + a timestamp column and computes sessions based on the given session duration. + Additionally, it is possible to provide a ``by`` column or list of columns + (e.g., user ID or (user ID, user device)) to compute sessions for each grouping + value. A new dataset has also been added and can be accessed by using the fetcher + :class:`~skrub.datasets.fetch_wowah`. + :pr:`1930` by :user:`Riccardo Cappuzzo `. 
Changes ------- - An unnecessary warning that was raised when passing a numpy array to the From dcc13699ba19b0bb73f37ac2c8c7842321911c4b Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Sun, 24 May 2026 16:09:50 +0200 Subject: [PATCH 25/74] implementing a fix from review --- skrub/_session_encoder.py | 56 +++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 95c00129a..b9afc6fb8 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -19,6 +19,7 @@ from sklearn.utils.validation import check_is_fitted from . import _dataframe as sbd +from . import selectors as s from ._dispatch import dispatch, raise_dispatch_unregistered_type from ._utils import random_string @@ -113,8 +114,8 @@ class SessionEncoder(TransformerMixin, BaseEstimator): by at most ``session_gap`` minutes. Additionally, it is possible to provide a column or list of columns that identifies the user (specified by the ``group_by`` column). When the time gap between consecutive events exceeds ``session_gap``, or - when the user changes, a new session begins. All unrelated columns are passed - through unchanged. + when what identifies a user changes, a new session begins. All unrelated columns + are passed through unchanged. Parameters ---------- @@ -145,15 +146,14 @@ class SessionEncoder(TransformerMixin, BaseEstimator): Examples -------- Consider this example where we have a dataframe with user events, and we want - to identify sessions based on a 30-minute gap between events for each user: + to identify sessions based on a 30-minute gap between events for each user. + Users are identified by the value of the column ``user_id``. >>> import pandas as pd >>> from datetime import datetime, timedelta >>> encoder = SessionEncoder( ... group_by='user_id', timestamp_col='timestamp', session_gap=30 ... 
) - - >>> # Create a sample dataframe with events from different users >>> data = { ... 'user_id': ['alice', 'alice', 'alice', 'bob', 'bob'], ... 'timestamp': [ @@ -200,8 +200,6 @@ class SessionEncoder(TransformerMixin, BaseEstimator): ... timestamp_col='timestamp', ... session_gap=30 ... ) - - >>> # Create a sample dataframe where user_id + device_id identifies a user >>> data_multi = { ... 'user_id': [1, 1, 1, 1, 2, 2], ... 'device_id': ['mobile', 'mobile', 'desktop', 'desktop', 'mobile', 'mobile'], @@ -233,7 +231,7 @@ class SessionEncoder(TransformerMixin, BaseEstimator): - User 1 on "mobile" has session 1 (different device, so separate session). - User 2 on "mobile" has session 2 (different user). - You can also use SessionEncoder without a user identifier column. In this case, + You can also use ``SessionEncoder`` without a user identifier column. In this case, sessions are separated only by time gaps. This is useful for analyzing a single timeseries or events that don't have a user dimension: @@ -242,8 +240,6 @@ class SessionEncoder(TransformerMixin, BaseEstimator): ... timestamp_col='timestamp', ... session_gap=30 ... ) - - >>> # Create a sample dataframe with only timestamps >>> data_no_group = { ... 'timestamp': [ ... pd.Timestamp('2024-01-01 10:00:00'), @@ -314,26 +310,27 @@ def fit_transform(self, X, y=None): The transformed dataframe with session information. 
""" self.all_inputs_ = sbd.column_names(X) + + # Check that the timestamp column is present + if self.timestamp_col not in self.all_inputs_: + raise ValueError( + f"Column '{self.timestamp_col}' not found in input dataframe" + ) # check that the required columns are present in the input dataframe + self._group_by_columns = [] if self.group_by is not None: if isinstance(self.group_by, str): - self.group_by_columns = [self.group_by] + self._group_by_columns = [self.group_by] elif isinstance(self.group_by, Iterable) and not isinstance( self.group_by, str ): - self.group_by_columns = list(self.group_by) + self._group_by_columns = list(self.group_by) else: raise TypeError("group_by must be a string, a list of strings, or None") - if self.group_by is not None: - for col in self.group_by_columns: + for col in self._group_by_columns: if col not in self.all_inputs_: raise ValueError(f"Column '{col}' not found in input dataframe") - if self.timestamp_col not in self.all_inputs_: - raise ValueError( - f"Column '{self.timestamp_col}' not found in input dataframe" - ) - # check the correctness of the values of session_gap if not isinstance(self.session_gap, numbers.Number) or self.session_gap <= 0: raise ValueError("session_gap must be a positive number") @@ -341,13 +338,24 @@ def fit_transform(self, X, y=None): row_order_col = f"_row_order_skrub_{random_string()}" X = sbd.with_columns(X, **{row_order_col: range(X.shape[0])}) + # Removing unrelated columns for computing the sessions + cols_to_remove = [ + _ + for _ in self.all_inputs_ + if _ not in self._group_by_columns + [self.timestamp_col] + ] + if cols_to_remove: + X_selected = sbd.drop_columns(X, s.cols(*cols_to_remove).expand(X)) + else: + X_selected = X + # sort the input dataframe by the "group_by" and "timestamp" columns sort_by = ( - self.group_by_columns + [self.timestamp_col] + self._group_by_columns + [self.timestamp_col] if self.group_by is not None else [self.timestamp_col] ) - X_sorted = sbd.sort(X, by=sort_by) 
+ X_sorted = sbd.sort(X_selected, by=sort_by) X_factorized, factorized_by = self._factorize_columns(X_sorted) # add the session id @@ -356,11 +364,13 @@ def fit_transform(self, X, y=None): ) # drop the factorized "group_by" column if the original "group_by" # column was not numeric - to_drop = [col for col in factorized_by if col not in self.group_by_columns] + to_drop = [col for col in factorized_by if col not in self._group_by_columns] X_with_session_id = sbd.drop_columns(X_with_session_id, to_drop) X_result = sbd.sort(X_with_session_id, by=row_order_col) X_result = sbd.drop_columns(X_result, row_order_col) + if cols_to_remove: + X_result = sbd.concat(X_result, s.select(X, cols_to_remove), axis=1) self.all_outputs_ = sbd.column_names(X_result) return X_result @@ -390,7 +400,7 @@ def _factorize_columns(self, X): f"{col}_factorized_skrub_{random_string()}": _factorize_column(X, col) if not sbd.is_numeric(X[col]) else X[col] - for col in self.group_by_columns + for col in self._group_by_columns } X_factorized = sbd.with_columns(X, **factorized_columns) From 87371e6995e84db58bdbdd6b9da054affb40df06 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 26 May 2026 10:52:50 +0200 Subject: [PATCH 26/74] reordering columns so that the session id is added as last col --- skrub/_session_encoder.py | 133 ++++++++++++++++++++++++-------------- 1 file changed, 85 insertions(+), 48 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index b9afc6fb8..307695ce7 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -30,12 +30,12 @@ @dispatch -def _add_session_id(X, group_by, timestamp_col, session_gap): +def _add_session_column(X, group_by, timestamp_col, session_gap, suffix): raise_dispatch_unregistered_type(X, kind="Dataframe") -@_add_session_id.specialize("pandas") -def _add_session_id_pandas(X, group_by, timestamp_col, session_gap): +@_add_session_column.specialize("pandas") +def _add_session_column_pandas(X, group_by, 
timestamp_col, session_gap, suffix): # pandas 3.0 changed the resolution of astype(int) for datetime columns from # nanoseconds to milliseconds, so we need to adjust the time difference calculation # accordingly @@ -59,13 +59,13 @@ def _add_session_id_pandas(X, group_by, timestamp_col, session_gap): else: is_new_session = time_diff # Compute cumulative sum of is_new_session to create session IDs - column_name = f"{timestamp_col}_session_id" + column_name = f"{timestamp_col}_{suffix}" X[column_name] = is_new_session.cumsum() return X -@_add_session_id.specialize("polars") -def _add_session_id_polars(X, group_by, timestamp_col, session_gap): +@_add_session_column.specialize("polars") +def _add_session_column_polars(X, group_by, timestamp_col, session_gap, suffix): # check if the time difference between events exceeds the session gap time_diff = ( X[timestamp_col].dt.epoch("ms").diff().fill_null(0) > session_gap * 60 * 1000 @@ -81,12 +81,14 @@ def _add_session_id_polars(X, group_by, timestamp_col, session_gap): else: is_new_session = time_diff # Add session_id by computing cumulative sum of is_new_session - column_name = f"{timestamp_col}_session_id" + column_name = f"{timestamp_col}_{suffix}" return X.with_columns(is_new_session.cum_sum().alias(column_name)) @dispatch def _factorize_column(X, column_name): + # Factorization is done so different groups can be found by doing a simple + # numeric difference raise_dispatch_unregistered_type(X, kind="Dataframe") @@ -150,6 +152,7 @@ class SessionEncoder(TransformerMixin, BaseEstimator): Users are identified by the value of the column ``user_id``. >>> import pandas as pd + >>> from skrub import SessionEncoder >>> from datetime import datetime, timedelta >>> encoder = SessionEncoder( ... group_by='user_id', timestamp_col='timestamp', session_gap=30 @@ -193,7 +196,7 @@ class SessionEncoder(TransformerMixin, BaseEstimator): 30-minute window. You can also identify users by multiple columns. 
For instance, the same user - on different devices should have separate sessions: + on different devices should have separate sessions. >>> encoder_multi = SessionEncoder( ... group_by=['user_id', 'device_id'], @@ -217,13 +220,13 @@ class SessionEncoder(TransformerMixin, BaseEstimator): >>> df_multi = pd.DataFrame(data_multi) >>> result_multi = encoder_multi.fit_transform(df_multi) >>> result_multi - user_id device_id timestamp action timestamp_session_id - 0 1 desktop 2024-01-01 10:05:00 view 0 - 1 1 desktop 2024-01-01 10:20:00 checkout 0 - 2 1 mobile 2024-01-01 10:00:00 view 1 - 3 1 mobile 2024-01-01 10:10:00 purchase 1 - 4 2 mobile 2024-01-01 10:00:00 login 2 - 5 2 mobile 2024-01-01 10:15:00 view 2 + user_id device_id timestamp action timestamp_session_id + 0 1 mobile 2024-01-01 10:00:00 view 1 + 1 1 mobile 2024-01-01 10:10:00 purchase 1 + 2 1 desktop 2024-01-01 10:05:00 view 0 + 3 1 desktop 2024-01-01 10:20:00 checkout 0 + 4 2 mobile 2024-01-01 10:00:00 login 2 + 5 2 mobile 2024-01-01 10:15:00 view 2 In this example: @@ -231,6 +234,11 @@ class SessionEncoder(TransformerMixin, BaseEstimator): - User 1 on "mobile" has session 1 (different device, so separate session). - User 2 on "mobile" has session 2 (different user). + Note that sessions are defined by sorting over the grouping columns and then + by the timestamp: this is why, while the "desktop" + session of User 1 starts after their "mobile" session, it has session id ``0`` + since in alphabetical ordering "desktop" is first. + You can also use ``SessionEncoder`` without a user identifier column. In this case, sessions are separated only by time gaps. 
This is useful for analyzing a single timeseries or events that don't have a user dimension: @@ -269,10 +277,13 @@ class SessionEncoder(TransformerMixin, BaseEstimator): """ - def __init__(self, timestamp_col, group_by=None, session_gap=30): + def __init__( + self, timestamp_col, group_by=None, session_gap=30, suffix="session_id" + ): self.timestamp_col = timestamp_col self.group_by = group_by self.session_gap = session_gap + self.suffix = suffix def fit(self, X, y=None): """Fit the transformer to the data. @@ -311,40 +322,28 @@ def fit_transform(self, X, y=None): """ self.all_inputs_ = sbd.column_names(X) - # Check that the timestamp column is present - if self.timestamp_col not in self.all_inputs_: + # Checking that all the needed columns are there + self._check_input_dataframe() + # check the correctness of the values of session_gap + if not isinstance(self.session_gap, numbers.Number): + raise TypeError(f"Expected a number, got {type(self.session_gap)}") + if self.session_gap <= 0: raise ValueError( - f"Column '{self.timestamp_col}' not found in input dataframe" + f"session_gap must be a positive number, got {self.session_gap}" ) - # check that the required columns are present in the input dataframe - self._group_by_columns = [] - if self.group_by is not None: - if isinstance(self.group_by, str): - self._group_by_columns = [self.group_by] - elif isinstance(self.group_by, Iterable) and not isinstance( - self.group_by, str - ): - self._group_by_columns = list(self.group_by) - else: - raise TypeError("group_by must be a string, a list of strings, or None") - for col in self._group_by_columns: - if col not in self.all_inputs_: - raise ValueError(f"Column '{col}' not found in input dataframe") - # check the correctness of the values of session_gap - if not isinstance(self.session_gap, numbers.Number) or self.session_gap <= 0: - raise ValueError("session_gap must be a positive number") + self._session_id_name = f"{self.timestamp_col}_{self.suffix}" + # Adding a row 
order column to sort lines back row_order_col = f"_row_order_skrub_{random_string()}" X = sbd.with_columns(X, **{row_order_col: range(X.shape[0])}) # Removing unrelated columns for computing the sessions - cols_to_remove = [ + if cols_to_remove := [ _ for _ in self.all_inputs_ if _ not in self._group_by_columns + [self.timestamp_col] - ] - if cols_to_remove: + ]: X_selected = sbd.drop_columns(X, s.cols(*cols_to_remove).expand(X)) else: X_selected = X @@ -359,18 +358,25 @@ def fit_transform(self, X, y=None): X_factorized, factorized_by = self._factorize_columns(X_sorted) # add the session id - X_with_session_id = _add_session_id( - X_factorized, factorized_by, self.timestamp_col, self.session_gap + + X_with_session_id = self._add_session_id( + X_factorized, + factorized_by, ) - # drop the factorized "group_by" column if the original "group_by" - # column was not numeric + # Reordering rows back to the original order + X_result = sbd.sort(X_with_session_id, by=row_order_col) + + # drop the factorized "group_by" columns if the original "group_by" + # columns were not numeric, and the column used to reorder to_drop = [col for col in factorized_by if col not in self._group_by_columns] - X_with_session_id = sbd.drop_columns(X_with_session_id, to_drop) + to_drop += [row_order_col] + X_result = sbd.drop_columns(X_result, to_drop) - X_result = sbd.sort(X_with_session_id, by=row_order_col) - X_result = sbd.drop_columns(X_result, row_order_col) + # If unrelated columns were removed earlier, bring them back here if cols_to_remove: X_result = sbd.concat(X_result, s.select(X, cols_to_remove), axis=1) + # proper_column_order = self.all_inputs_ + [self._session_id_name] + X_result = s.select(X_result, self.all_inputs_ + [self._session_id_name]) self.all_outputs_ = sbd.column_names(X_result) return X_result @@ -391,9 +397,30 @@ def transform(self, X): check_is_fitted(self) return self.fit_transform(X) + def _check_input_dataframe(self): + # Check that the timestamp column is 
present + if self.timestamp_col not in self.all_inputs_: + raise ValueError( + f"Column '{self.timestamp_col}' not found in input dataframe" + ) + # check that the required columns are present in the input dataframe + self._group_by_columns = [] + if self.group_by is not None: + if isinstance(self.group_by, str): + self._group_by_columns = [self.group_by] + elif isinstance(self.group_by, Iterable) and not isinstance( + self.group_by, str + ): + self._group_by_columns = list(self.group_by) + else: + raise TypeError("group_by must be a string, a list of strings, or None") + for col in self._group_by_columns: + if col not in self.all_inputs_: + raise ValueError(f"Column '{col}' not found in input dataframe") + def _factorize_columns(self, X): - # convert group_by column to string if it's not already, to ensure - # that the diff operation works correctly + # convert group_by columns to numerical columns if they're not already, to + # ensure that the diff operation works correctly if not self.group_by: return X, [] factorized_columns = { @@ -407,6 +434,16 @@ def _factorize_columns(self, X): return X_factorized, list(factorized_columns.keys()) + def _add_session_id(self, X_factorized, factorized_by): + X_with_session_id = _add_session_column( + X_factorized, + factorized_by, + self.timestamp_col, + self.session_gap, + self.suffix, + ) + return X_with_session_id + def get_feature_names_out(self, input_features=None): """Return the column names of the output of ``transform`` as a list of strings. 
From b336d9bb1f073aea00f5939ebf5dfa74db50d7d3 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 26 May 2026 11:00:26 +0200 Subject: [PATCH 27/74] more fixes --- skrub/_session_encoder.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 307695ce7..6a42da42f 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -338,7 +338,7 @@ def fit_transform(self, X, y=None): row_order_col = f"_row_order_skrub_{random_string()}" X = sbd.with_columns(X, **{row_order_col: range(X.shape[0])}) - # Removing unrelated columns for computing the sessions + # Dropping unneeded columns to reduce the sorting overhead if cols_to_remove := [ _ for _ in self.all_inputs_ @@ -357,7 +357,6 @@ def fit_transform(self, X, y=None): X_sorted = sbd.sort(X_selected, by=sort_by) X_factorized, factorized_by = self._factorize_columns(X_sorted) - # add the session id X_with_session_id = self._add_session_id( X_factorized, @@ -375,8 +374,9 @@ def fit_transform(self, X, y=None): # If unrelated columns were removed earlier, bring them back here if cols_to_remove: X_result = sbd.concat(X_result, s.select(X, cols_to_remove), axis=1) - # proper_column_order = self.all_inputs_ + [self._session_id_name] - X_result = s.select(X_result, self.all_inputs_ + [self._session_id_name]) + + # Reordering columns so that the session_id is added as the last column + X_result = s.select(X_result, self.all_inputs_ + [self._session_id_name]) self.all_outputs_ = sbd.column_names(X_result) return X_result From 9fbe79da14d9136e5f1203e4cfa9552de73cc229 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 26 May 2026 11:32:30 +0200 Subject: [PATCH 28/74] _ --- skrub/_session_encoder.py | 2 + skrub/tests/test_session_encoder.py | 70 ++++++++++++++++++++++++----- 2 files changed, 61 insertions(+), 11 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 6a42da42f..14996eeef 100644 --- 
a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -331,6 +331,8 @@ def fit_transform(self, X, y=None): raise ValueError( f"session_gap must be a positive number, got {self.session_gap}" ) + if not isinstance(self.suffix, str) or self.suffix is None: + raise ValueError(f"Expected a string as suffix, got {self.suffix!r}") self._session_id_name = f"{self.timestamp_col}_{self.suffix}" diff --git a/skrub/tests/test_session_encoder.py b/skrub/tests/test_session_encoder.py index a0ac94d64..29a1e477d 100644 --- a/skrub/tests/test_session_encoder.py +++ b/skrub/tests/test_session_encoder.py @@ -7,7 +7,7 @@ from skrub import SessionEncoder from skrub import _dataframe as sbd from skrub._session_encoder import ( - _add_session_id, + _add_session_column, _factorize_column, ) @@ -412,6 +412,13 @@ def test_session_encoder_invalid_parameters(df_module): } ) + # Test non-numeric session_gap + se_non_numeric = SessionEncoder( + group_by="user_id", timestamp_col="timestamp", session_gap="thirty" + ) + with pytest.raises(TypeError, match="Expected a number"): + se_non_numeric.fit_transform(df) + # Test negative session_gap se_negative = SessionEncoder( group_by="user_id", timestamp_col="timestamp", session_gap=-10 @@ -426,12 +433,12 @@ def test_session_encoder_invalid_parameters(df_module): with pytest.raises(ValueError, match="session_gap must be a positive number"): se_zero.fit_transform(df) - # Test non-numeric session_gap - se_non_numeric = SessionEncoder( - group_by="user_id", timestamp_col="timestamp", session_gap="thirty" + # Test invalid suffix (None) + se_invalid_suffix = SessionEncoder( + group_by="user_id", timestamp_col="timestamp", suffix=None ) - with pytest.raises(ValueError, match="session_gap must be a positive number"): - se_non_numeric.fit_transform(df) + with pytest.raises(ValueError, match="Expected a string as suffix"): + se_invalid_suffix.fit_transform(df) def test_session_encoder_preserves_columns(df_module): @@ -538,14 +545,17 @@ def 
test_check_is_new_session_no_by(df_module): } ) session_id = sbd.to_list( - sbd.col(_add_session_id(df, [], "timestamp", 30), "timestamp_session_id") + sbd.col( + _add_session_column(df, [], "timestamp", 30, suffix="session_id"), + "timestamp_session_id", + ) ) # Expected: first two events in session 0, last two events in session 1 assert session_id == [0, 0, 1, 1] def test_check_is_new_session_with_by(df_module): - """_add_session_id returns a dataframe with a ``timestamp_session_id`` + """_add_session_column returns a dataframe with a ``timestamp_session_id`` column when a group_by-list is provided. A new session starts when the group key changes (even for a tiny time gap) or when the time gap exceeds ``session_gap``. @@ -568,14 +578,48 @@ def test_check_is_new_session_with_by(df_module): ], } ) - result = _add_session_id(df, ["user_id"], "timestamp", 30) + result = _add_session_column(df, ["user_id"], "timestamp", 30, "session_id") - # _add_session_id now returns the full dataframe with session_id added + # _add_session_column now returns the full dataframe with session_id added assert "timestamp_session_id" in sbd.column_names(result) session_ids = sbd.to_list(sbd.col(result, "timestamp_session_id")) assert session_ids == [0, 0, 1, 1] +@pytest.mark.parametrize( + "timestamp", + ["timestamp", "something_else"], +) +@pytest.mark.parametrize( + "suffix", + [None, "session_id", "test_suffix"], +) +def test_proper_suffix(timestamp, suffix, df_module): + df = df_module.make_dataframe( + { + "user_id": [1, 1, 2, 2], + timestamp: [ + datetime.datetime(2024, 1, 1, 10, 0), + datetime.datetime(2024, 1, 1, 10, 5), # same user, 5 min gap + datetime.datetime(2024, 1, 1, 10, 6), # different user, 1 min gap + datetime.datetime(2024, 1, 1, 10, 10), # same user, 4 min gap + ], + } + ) + if suffix is None: + with pytest.raises(ValueError, match="Expected a string as suffix*"): + SessionEncoder( + timestamp_col=timestamp, group_by="user_id", suffix=suffix + ).fit_transform(df) + 
else: + result = SessionEncoder( + timestamp_col=timestamp, group_by="user_id", suffix=suffix + ).fit_transform(df) + # _add_session_column now returns the full dataframe with session_id added + expected_name = f"{timestamp}_{suffix}" + assert expected_name in sbd.column_names(result) + + def test_session_encoder_preserves_input_order(df_module): """Test that the output rows are in the same order as the input rows. @@ -606,7 +650,11 @@ def test_session_encoder_preserves_input_order(df_module): "func", ( partial( - _add_session_id, group_by=[], timestamp_col="timestamp", session_gap=30 + _add_session_column, + group_by=[], + timestamp_col="timestamp", + session_gap=30, + suffix="_session_id", ), partial(_factorize_column, column_name="user_id"), ), From 16801626cede2ac054bb6f12c0db357ebe8ebc7d Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Wed, 27 May 2026 11:53:50 +0200 Subject: [PATCH 29/74] example --- skrub/_session_encoder.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 14996eeef..32762127e 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -136,6 +136,10 @@ class SessionEncoder(TransformerMixin, BaseEstimator): between two events exceeds this value, they are considered to be in different sessions. + suffix : str, default="session_id" + The suffix to be added to the name of the timestamp column. The format + will be "TIMESTAMP_SUFFIX". + Attributes ---------- all_inputs_ : list of str @@ -274,6 +278,29 @@ class SessionEncoder(TransformerMixin, BaseEstimator): - The event at 11:00 starts a new session 1 (45 min gap > 30 min). - The event at 11:10 continues session 1 (10 min gap < 30 min). + It is also possible to change the suffix that is added at the end of the session + ID column via the "suffix" parameter. 
This is useful, for example, if you want + to add sessions based on different groupings or intervals: + + >>> import pandas as pd + >>> from skrub import SessionEncoder + >>> from datetime import datetime, timedelta + >>> encoder = SessionEncoder( + ... group_by='user_id', timestamp_col='timestamp', session_gap=30 + ... ) + >>> data = { + ... 'user_id': ['alice', 'alice', 'alice', 'bob', 'bob'], + ... 'timestamp': [ + ... pd.Timestamp('2024-01-01 10:00:00'), + ... pd.Timestamp('2024-01-01 10:05:00'), # 5 min later, same session + ... pd.Timestamp('2024-01-01 11:00:00'), # 55 min later, new session + ... pd.Timestamp('2024-01-01 10:00:00'), # Different user + ... pd.Timestamp('2024-01-01 10:20:00'), # 20 min later, same session + ... ], + ... 'action': ['login', 'view', 'purchase', 'login', 'purchase'] + ... } + >>> df = pd.DataFrame(data) + >>> encoder_user = SessionEncoder(group_) """ From 091a66cbfef238d81acd9c79c1b4401e15fcc7f6 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 28 May 2026 11:23:30 +0200 Subject: [PATCH 30/74] docstrings --- CHANGES.rst | 3 +-- skrub/_session_encoder.py | 54 ++++++++++++++++++++++++++------------- 2 files changed, 37 insertions(+), 20 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 6e8ef8410..ac45ab97d 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -15,8 +15,7 @@ New Features a timestamp column and computes sessions based on the given session duration. Additionally, it is possible to provide a ``by`` column or list of columns (e.g., user ID or (user ID, user device)) to compute sessions for each grouping - value. A new dataset has also been added and can be accessed by using the fetcher - :class:`~skrub.datasets.fetch_wowah`. + value. A new synthetic dataset generator has also been added. :pr:`1930` by :user:`Riccardo Cappuzzo `. 
Changes ------- diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 32762127e..a0b7508cc 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -131,10 +131,10 @@ class SessionEncoder(TransformerMixin, BaseEstimator): provided, sessions are detected based on the time gap between events, and all events are considered to belong to the same user (or group). - session_gap : int, default=30 - The maximum gap (in minutes) between events in a session. If the gap + session_gap : int, default=1800 + The maximum gap (in seconds) between events in a session. If the gap between two events exceeds this value, they are considered to be in - different sessions. + different sessions. Default is 1800 seconds (30 minutes). suffix : str, default="session_id" The suffix to be added to the name of the timestamp column. The format @@ -282,30 +282,48 @@ class SessionEncoder(TransformerMixin, BaseEstimator): ID column via the "suffix" parameter. This is useful, for example, if you want to add sessions based on different groupings or intervals: - >>> import pandas as pd - >>> from skrub import SessionEncoder - >>> from datetime import datetime, timedelta - >>> encoder = SessionEncoder( - ... group_by='user_id', timestamp_col='timestamp', session_gap=30 - ... ) - >>> data = { - ... 'user_id': ['alice', 'alice', 'alice', 'bob', 'bob'], + >>> data_multi = { + ... 'user_id': [1, 1, 1, 1, 2, 2], + ... 'device_id': ['mobile', 'mobile', 'desktop', 'desktop', 'mobile', 'mobile'], ... 'timestamp': [ ... pd.Timestamp('2024-01-01 10:00:00'), - ... pd.Timestamp('2024-01-01 10:05:00'), # 5 min later, same session - ... pd.Timestamp('2024-01-01 11:00:00'), # 55 min later, new session + ... pd.Timestamp('2024-01-01 10:10:00'), # 10 min later, same session + ... pd.Timestamp('2024-01-01 10:05:00'), # Different device (sorted), + ... # different session + ... pd.Timestamp('2024-01-01 10:20:00'), # 15 min later, same session ... 
pd.Timestamp('2024-01-01 10:00:00'), # Different user - ... pd.Timestamp('2024-01-01 10:20:00'), # 20 min later, same session + ... pd.Timestamp('2024-01-01 10:15:00'), # 15 min later, same session ... ], - ... 'action': ['login', 'view', 'purchase', 'login', 'purchase'] + ... 'action': ['view', 'purchase', 'view', 'checkout', 'login', 'view'] ... } - >>> df = pd.DataFrame(data) - >>> encoder_user = SessionEncoder(group_) + >>> df = pd.DataFrame(data_multi) + >>> encoder_user = SessionEncoder("timestamp", + ... group_by=["user_id"], suffix="user") + >>> encoder_user.fit_transform(df) + user_id device_id timestamp action timestamp_user + 0 1 mobile 2024-01-01 10:00:00 view 0 + 1 1 mobile 2024-01-01 10:10:00 purchase 0 + 2 1 desktop 2024-01-01 10:05:00 view 0 + 3 1 desktop 2024-01-01 10:20:00 checkout 0 + 4 2 mobile 2024-01-01 10:00:00 login 1 + 5 2 mobile 2024-01-01 10:15:00 view 1 + + >>> encoder_user_device = SessionEncoder("timestamp", + ... group_by=["user_id", "device_id"], + ... suffix="user_device") + >>> encoder_user_device.fit_transform(df) + user_id device_id timestamp action timestamp_user_device + 0 1 mobile 2024-01-01 10:00:00 view 1 + 1 1 mobile 2024-01-01 10:10:00 purchase 1 + 2 1 desktop 2024-01-01 10:05:00 view 0 + 3 1 desktop 2024-01-01 10:20:00 checkout 0 + 4 2 mobile 2024-01-01 10:00:00 login 2 + 5 2 mobile 2024-01-01 10:15:00 view 2 """ def __init__( - self, timestamp_col, group_by=None, session_gap=30, suffix="session_id" + self, timestamp_col, group_by=None, session_gap=30 * 60, suffix="session_id" ): self.timestamp_col = timestamp_col self.group_by = group_by From 8248f5010f1407162313f0df229becd8cad45af6 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 28 May 2026 11:33:39 +0200 Subject: [PATCH 31/74] changing to seconds --- skrub/_session_encoder.py | 38 +++++++++++++++-------------- skrub/tests/test_session_encoder.py | 26 +++++++++++++------- 2 files changed, 37 insertions(+), 27 deletions(-) diff --git 
a/skrub/_session_encoder.py b/skrub/_session_encoder.py index a0b7508cc..829e8a98e 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -2,8 +2,8 @@ The SessionEncoder is a transformer that takes as input: - a "timestamp" column, which identifies the time of an event - a "by" column or list of columns, which identifies a user -- a "session_gap" value, which identifies the maximum allowed gap between events -in a session +- a "session_gap" value, which identifies the maximum allowed gap in seconds +between events in a session It returns a dataframe with the same number of rows as the input, but with the column "session_id": a unique identifier for each session, which is a combination @@ -42,13 +42,11 @@ def _add_session_column_pandas(X, group_by, timestamp_col, session_gap, suffix): if parse(pd.__version__).major <= 2: # check if the time difference between events exceeds the session gap time_diff = ( - X[timestamp_col].astype(int).diff().fillna(0) // 10**6 - > session_gap * 60 * 1000 + X[timestamp_col].astype(int).diff().fillna(0) // 10**6 > session_gap * 1000 ) else: time_diff = ( - X[timestamp_col].astype(int).diff().fillna(0) // 10**3 - > session_gap * 60 * 1000 + X[timestamp_col].astype(int).diff().fillna(0) // 10**3 > session_gap * 1000 ) if group_by: # check if the "group_by" column changes @@ -67,9 +65,7 @@ def _add_session_column_pandas(X, group_by, timestamp_col, session_gap, suffix): @_add_session_column.specialize("polars") def _add_session_column_polars(X, group_by, timestamp_col, session_gap, suffix): # check if the time difference between events exceeds the session gap - time_diff = ( - X[timestamp_col].dt.epoch("ms").diff().fill_null(0) > session_gap * 60 * 1000 - ) + time_diff = X[timestamp_col].dt.epoch("ms").diff().fill_null(0) > session_gap * 1000 if group_by: # check if the "group_by" column changes group_diff = X.select( @@ -113,7 +109,7 @@ class SessionEncoder(TransformerMixin, BaseEstimator): """Encode sessions from a 
dataframe. A session is defined as a sequence of events where consecutive events are separated - by at most ``session_gap`` minutes. Additionally, it is possible to provide a column + by at most ``session_gap`` seconds. Additionally, it is possible to provide a column or list of columns that identifies the user (specified by the ``group_by`` column). When the time gap between consecutive events exceeds ``session_gap``, or when what identifies a user changes, a new session begins. All unrelated columns @@ -137,8 +133,7 @@ class SessionEncoder(TransformerMixin, BaseEstimator): different sessions. Default is 1800 seconds (30 minutes). suffix : str, default="session_id" - The suffix to be added to the name of the timestamp column. The format - will be "TIMESTAMP_SUFFIX". + The suffix to be added to the name of the timestamp column. Attributes ---------- @@ -147,7 +142,7 @@ class SessionEncoder(TransformerMixin, BaseEstimator): all_outputs_: list of str All column names in the input dataframe plus the new column that identifies - the session, with name "{timestamp}_session_id". + the session, with name "{timestamp}_{suffix}". Examples -------- @@ -159,7 +154,7 @@ class SessionEncoder(TransformerMixin, BaseEstimator): >>> from skrub import SessionEncoder >>> from datetime import datetime, timedelta >>> encoder = SessionEncoder( - ... group_by='user_id', timestamp_col='timestamp', session_gap=30 + ... group_by='user_id', timestamp_col='timestamp', session_gap=30 * 60 ... ) >>> data = { ... 'user_id': ['alice', 'alice', 'alice', 'bob', 'bob'], @@ -205,7 +200,7 @@ class SessionEncoder(TransformerMixin, BaseEstimator): >>> encoder_multi = SessionEncoder( ... group_by=['user_id', 'device_id'], ... timestamp_col='timestamp', - ... session_gap=30 + ... session_gap=30 * 60 ... ) >>> data_multi = { ... 'user_id': [1, 1, 1, 1, 2, 2], @@ -250,7 +245,7 @@ class SessionEncoder(TransformerMixin, BaseEstimator): >>> encoder_no_group = SessionEncoder( ... group_by=None, ... 
timestamp_col='timestamp', - ... session_gap=30 + ... session_gap=30 * 60 ... ) >>> data_no_group = { ... 'timestamp': [ @@ -445,6 +440,10 @@ def transform(self, X): return self.fit_transform(X) def _check_input_dataframe(self): + """ + Check that the input columns are present and correct + """ + # Check that the timestamp column is present if self.timestamp_col not in self.all_inputs_: raise ValueError( @@ -466,8 +465,11 @@ def _check_input_dataframe(self): raise ValueError(f"Column '{col}' not found in input dataframe") def _factorize_columns(self, X): - # convert group_by columns to numerical columns if they're not already, to - # ensure that the diff operation works correctly + """ + convert group_by columns to numerical columns if they're not already, to + ensure that the diff operation works correctly + """ + if not self.group_by: return X, [] factorized_columns = { diff --git a/skrub/tests/test_session_encoder.py b/skrub/tests/test_session_encoder.py index 29a1e477d..eb994739a 100644 --- a/skrub/tests/test_session_encoder.py +++ b/skrub/tests/test_session_encoder.py @@ -112,7 +112,9 @@ def test_session_encoder_basic( ): """Test basic sessionization grouping by user_id or username.""" # Apply SessionEncoder grouping by the specified column - se = SessionEncoder(group_by=by_column, timestamp_col="timestamp", session_gap=30) + se = SessionEncoder( + group_by=by_column, timestamp_col="timestamp", session_gap=30 * 60 + ) result = se.fit_transform(example_session_data) # Check that we have the expected total number of sessions @@ -146,7 +148,9 @@ def test_session_encoder_different_users_different_sessions( ): """Test that different users/groups have different session IDs.""" # Apply SessionEncoder - se = SessionEncoder(group_by=by_column, timestamp_col="timestamp", session_gap=30) + se = SessionEncoder( + group_by=by_column, timestamp_col="timestamp", session_gap=30 * 60 + ) result = se.fit_transform(example_session_data) # content of the "session_id" column after 
sessionization @@ -183,7 +187,9 @@ def test_session_encoder_multi_by_columns(example_session_data_multi_by): Total: 4 sessions """ se = SessionEncoder( - group_by=["user_id", "device_id"], timestamp_col="timestamp", session_gap=30 + group_by=["user_id", "device_id"], + timestamp_col="timestamp", + session_gap=30 * 60, ) result = se.fit_transform(example_session_data_multi_by) @@ -235,7 +241,9 @@ def test_session_encoder_multiple_users(df_module): } ) - se = SessionEncoder(group_by="user_id", timestamp_col="timestamp", session_gap=30) + se = SessionEncoder( + group_by="user_id", timestamp_col="timestamp", session_gap=30 * 60 + ) result = se.fit_transform(df) # After sorting by user_id and timestamp, each user should have 1 session @@ -266,7 +274,7 @@ def test_session_encoder_time_gap_threshold(df_module): # With 20-minute gap: should create 2 sessions (split at 35-min gap) se_20 = SessionEncoder( - group_by="user_id", timestamp_col="timestamp", session_gap=20 + group_by="user_id", timestamp_col="timestamp", session_gap=20 * 60 ) result_20 = se_20.fit_transform(df) session_ids_20 = sbd.to_list(sbd.col(result_20, "timestamp_session_id")) @@ -274,7 +282,7 @@ def test_session_encoder_time_gap_threshold(df_module): # With 40-minute gap: should create 1 session (all gaps < 40 min) se_40 = SessionEncoder( - group_by="user_id", timestamp_col="timestamp", session_gap=40 + group_by="user_id", timestamp_col="timestamp", session_gap=40 * 60 ) result_40 = se_40.fit_transform(df) session_ids_40 = sbd.to_list(sbd.col(result_40, "timestamp_session_id")) @@ -302,7 +310,7 @@ def test_session_encoder_no_user_column(df_module): ) # Without 'group_by', sessions are separated only by time gaps - se = SessionEncoder(group_by=None, timestamp_col="timestamp", session_gap=30) + se = SessionEncoder(group_by=None, timestamp_col="timestamp", session_gap=30 * 60) result = se.fit_transform(df) session_ids = sbd.to_list(sbd.col(result, "timestamp_session_id")) @@ -546,7 +554,7 @@ def 
test_check_is_new_session_no_by(df_module): ) session_id = sbd.to_list( sbd.col( - _add_session_column(df, [], "timestamp", 30, suffix="session_id"), + _add_session_column(df, [], "timestamp", 30 * 60, suffix="session_id"), "timestamp_session_id", ) ) @@ -578,7 +586,7 @@ def test_check_is_new_session_with_by(df_module): ], } ) - result = _add_session_column(df, ["user_id"], "timestamp", 30, "session_id") + result = _add_session_column(df, ["user_id"], "timestamp", 30 * 60, "session_id") # _add_session_column now returns the full dataframe with session_id added assert "timestamp_session_id" in sbd.column_names(result) From 6c2a92c361155362074ea3f1eebb8b1137413bdf Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 28 May 2026 13:18:35 +0200 Subject: [PATCH 32/74] more tests --- skrub/_dataframe/_common.py | 16 +++++++++++ skrub/_dataframe/tests/test_common.py | 8 ++++++ skrub/_session_encoder.py | 7 +++++ skrub/tests/test_session_encoder.py | 40 +++++++++++++++++++++++++++ 4 files changed, 71 insertions(+) diff --git a/skrub/_dataframe/_common.py b/skrub/_dataframe/_common.py index 495fdb640..684ffdf4b 100644 --- a/skrub/_dataframe/_common.py +++ b/skrub/_dataframe/_common.py @@ -78,6 +78,7 @@ "is_categorical", "to_categorical", "is_all_null", + "is_empty_frame", # # Inspecting, selecting and modifying values # @@ -1012,6 +1013,21 @@ def _is_all_null_polars(col): return all(is_null(col)) +@dispatch +def is_empty_frame(obj): + raise_dispatch_unregistered_type(obj, kind="object") + + +@is_empty_frame.specialize("pandas", argument_type="DataFrame") +def _is_empty_frame_pandas(obj): + return obj.empty + + +@is_empty_frame.specialize("polars", argument_type="DataFrame") +def _is_empty_frame_polars(obj): + return obj.is_empty() + + # # Inspecting, selecting and modifying values # ========================================== diff --git a/skrub/_dataframe/tests/test_common.py b/skrub/_dataframe/tests/test_common.py index f06fd382c..122cb8bd9 100644 --- 
a/skrub/_dataframe/tests/test_common.py +++ b/skrub/_dataframe/tests/test_common.py @@ -687,6 +687,14 @@ def test_is_all_null_polars(pl_module): assert ns.is_all_null(col) +def test_is_empty_frame(df_module): + empty_frame = df_module.make_dataframe({"a": []}) + not_empty_frame = df_module.make_dataframe({"a": [1]}) + + assert ns.is_empty_frame(empty_frame) + assert not ns.is_empty_frame(not_empty_frame) + + # Inspecting, selecting and modifying values # ========================================== # diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 829e8a98e..23834236c 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -13,6 +13,7 @@ import numbers from collections.abc import Iterable +import numpy as np import pandas as pd from packaging.version import parse from sklearn.base import BaseEstimator, TransformerMixin @@ -376,6 +377,12 @@ def fit_transform(self, X, y=None): self._session_id_name = f"{self.timestamp_col}_{self.suffix}" + if sbd.is_empty_frame(X): + X = sbd.with_columns( + X, **{self._session_id_name: np.array([], dtype=np.float32)} + ) + return X + # Adding a row order column to sort lines back row_order_col = f"_row_order_skrub_{random_string()}" X = sbd.with_columns(X, **{row_order_col: range(X.shape[0])}) diff --git a/skrub/tests/test_session_encoder.py b/skrub/tests/test_session_encoder.py index eb994739a..77d183cb5 100644 --- a/skrub/tests/test_session_encoder.py +++ b/skrub/tests/test_session_encoder.py @@ -2,7 +2,9 @@ from functools import partial import numpy as np +import pandas as pd import pytest +from packaging.version import parse from skrub import SessionEncoder from skrub import _dataframe as sbd @@ -135,6 +137,13 @@ def test_session_encoder_basic( for group_key, sessions in counted_sessions.items(): assert len(sessions) == group_key_to_sessions[group_key] + # Checking that fit then transform still works + result_fit = se.fit(example_session_data).transform(example_session_data) + # content of 
the "session_id" column after sessionization + session_ids_fit = sbd.to_list(sbd.col(result_fit, "timestamp_session_id")) + + assert session_ids == session_ids_fit + @pytest.mark.parametrize( "by_column,group_keys", @@ -562,6 +571,29 @@ def test_check_is_new_session_no_by(df_module): assert session_id == [0, 0, 1, 1] +@pytest.mark.skipif(parse(pd.__version__).major >= 3, reason="Test only for pandas < 3") +def test_add_session_column_old_pandas(df_module): + """Old versions of pandas have a different branch that needs to be covered""" + df = df_module.make_dataframe( + { + "timestamp": [ + datetime.datetime(2024, 1, 1, 10, 0), + datetime.datetime(2024, 1, 1, 10, 10), # 10 min — within gap + datetime.datetime(2024, 1, 1, 11, 0), # 50 min — exceeds gap + datetime.datetime(2024, 1, 1, 11, 5), # 5 min — within gap + ] + } + ) + session_id = sbd.to_list( + sbd.col( + _add_session_column(df, [], "timestamp", 30 * 60, suffix="session_id"), + "timestamp_session_id", + ) + ) + # Expected: first two events in session 0, last two events in session 1 + assert session_id == [0, 0, 1, 1] + + def test_check_is_new_session_with_by(df_module): """_add_session_column returns a dataframe with a ``timestamp_session_id`` column when a group_by-list is provided. 
A new session starts when the group key @@ -670,3 +702,11 @@ def test_session_encoder_preserves_input_order(df_module): def test_error_dispatch(func): with pytest.raises(TypeError, match="Expecting a Pandas or Polars Dataframe"): func(np.array([1])) + + +def test_empty_frame(df_module): + empty_df = df_module.make_dataframe({"timestamp": []}) + encoder = SessionEncoder("timestamp") + result = encoder.fit_transform(empty_df) + + assert sbd.column_names(result) == ["timestamp", "timestamp_session_id"] From 8c6d6a3395ab8311a57c1372a95861f755be631b Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 28 May 2026 13:39:41 +0200 Subject: [PATCH 33/74] ensuring that columns do not get overwritten --- skrub/_session_encoder.py | 21 +++++++++----- skrub/tests/test_session_encoder.py | 45 +++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 8 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 23834236c..0ac2d6ae4 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -31,12 +31,14 @@ @dispatch -def _add_session_column(X, group_by, timestamp_col, session_gap, suffix): +def _add_session_column(X, group_by, timestamp_col, session_gap, session_column_name): raise_dispatch_unregistered_type(X, kind="Dataframe") @_add_session_column.specialize("pandas") -def _add_session_column_pandas(X, group_by, timestamp_col, session_gap, suffix): +def _add_session_column_pandas( + X, group_by, timestamp_col, session_gap, session_column_name +): # pandas 3.0 changed the resolution of astype(int) for datetime columns from # nanoseconds to milliseconds, so we need to adjust the time difference calculation # accordingly @@ -58,13 +60,14 @@ def _add_session_column_pandas(X, group_by, timestamp_col, session_gap, suffix): else: is_new_session = time_diff # Compute cumulative sum of is_new_session to create session IDs - column_name = f"{timestamp_col}_{suffix}" - X[column_name] = is_new_session.cumsum() + X[session_column_name] 
= is_new_session.cumsum() return X @_add_session_column.specialize("polars") -def _add_session_column_polars(X, group_by, timestamp_col, session_gap, suffix): +def _add_session_column_polars( + X, group_by, timestamp_col, session_gap, session_column_name +): # check if the time difference between events exceeds the session gap time_diff = X[timestamp_col].dt.epoch("ms").diff().fill_null(0) > session_gap * 1000 if group_by: @@ -78,8 +81,7 @@ def _add_session_column_polars(X, group_by, timestamp_col, session_gap, suffix): else: is_new_session = time_diff # Add session_id by computing cumulative sum of is_new_session - column_name = f"{timestamp_col}_{suffix}" - return X.with_columns(is_new_session.cum_sum().alias(column_name)) + return X.with_columns(is_new_session.cum_sum().alias(session_column_name)) @dispatch @@ -377,6 +379,9 @@ def fit_transform(self, X, y=None): self._session_id_name = f"{self.timestamp_col}_{self.suffix}" + if self._session_id_name in self.all_inputs_: + self._session_id_name += f"_skrub_{random_string()}" + if sbd.is_empty_frame(X): X = sbd.with_columns( X, **{self._session_id_name: np.array([], dtype=np.float32)} @@ -496,7 +501,7 @@ def _add_session_id(self, X_factorized, factorized_by): factorized_by, self.timestamp_col, self.session_gap, - self.suffix, + self._session_id_name, ) return X_with_session_id diff --git a/skrub/tests/test_session_encoder.py b/skrub/tests/test_session_encoder.py index 77d183cb5..42095d3be 100644 --- a/skrub/tests/test_session_encoder.py +++ b/skrub/tests/test_session_encoder.py @@ -710,3 +710,48 @@ def test_empty_frame(df_module): result = encoder.fit_transform(empty_df) assert sbd.column_names(result) == ["timestamp", "timestamp_session_id"] + + +def test_not_overwriting_columns(df_module): + df = df_module.make_dataframe( + { + "timestamp": [ + datetime.datetime(2024, 1, 1, 10, 0), + datetime.datetime(2024, 1, 1, 10, 10), # 10 min — within gap + datetime.datetime(2024, 1, 1, 11, 0), # 50 min — exceeds gap + 
datetime.datetime(2024, 1, 1, 11, 5), # 5 min — within gap + ], + "timestamp_session_id": [1, 2, 3, 4], + } + ) + encoder = SessionEncoder("timestamp") + result = encoder.fit_transform(df) + + col_names = sbd.column_names(result) + assert "timestamp" in col_names + assert "timestamp_session_id" in col_names + # The original "timestamp_session_id" column should not be overwritten + # The new column has name "timestamp_session_id_skrub_RANDOM_SUFFIX" + assert col_names[2].removeprefix("timestamp_session_id").startswith("_skrub_") + + # Check that this also works for a custom suffix + df = df_module.make_dataframe( + { + "timestamp": [ + datetime.datetime(2024, 1, 1, 10, 0), + datetime.datetime(2024, 1, 1, 10, 10), # 10 min — within gap + datetime.datetime(2024, 1, 1, 11, 0), # 50 min — exceeds gap + datetime.datetime(2024, 1, 1, 11, 5), # 5 min — within gap + ], + "timestamp_custom_name": [1, 2, 3, 4], + } + ) + encoder = SessionEncoder("timestamp", suffix="custom_name") + result = encoder.fit_transform(df) + + col_names = sbd.column_names(result) + assert "timestamp" in col_names + assert "timestamp_custom_name" in col_names + # The original "timestamp_custom_name" column should not be overwritten + # The new column has name "timestamp_custom_name_skrub_RANDOM_SUFFIX" + assert col_names[2].removeprefix("timestamp_custom_name").startswith("_skrub_") From b8343952b8ad25671a3c1b5b47f91d9f358b757c Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 28 May 2026 13:57:19 +0200 Subject: [PATCH 34/74] renaming a parameter --- skrub/_session_encoder.py | 65 ++++++++++++++++------------- skrub/tests/test_session_encoder.py | 64 +++++++++++++++------------- 2 files changed, 69 insertions(+), 60 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 0ac2d6ae4..64150e9b9 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -31,13 +31,13 @@ @dispatch -def _add_session_column(X, group_by, timestamp_col, session_gap, 
session_column_name): +def _add_session_column(X, split_by, timestamp_col, session_gap, session_column_name): raise_dispatch_unregistered_type(X, kind="Dataframe") @_add_session_column.specialize("pandas") def _add_session_column_pandas( - X, group_by, timestamp_col, session_gap, session_column_name + X, split_by, timestamp_col, session_gap, session_column_name ): # pandas 3.0 changed the resolution of astype(int) for datetime columns from # nanoseconds to milliseconds, so we need to adjust the time difference calculation @@ -51,10 +51,10 @@ def _add_session_column_pandas( time_diff = ( X[timestamp_col].astype(int).diff().fillna(0) // 10**3 > session_gap * 1000 ) - if group_by: - # check if the "group_by" column changes - group_diff = (X[group_by].diff().fillna(0) != 0).any(axis=1) - # a new session starts if either the "group_by" column changes or the time + if split_by: + # check if the "split_by" column changes + group_diff = (X[split_by].diff().fillna(0) != 0).any(axis=1) + # a new session starts if either the "split_by" column changes or the time # gap is exceeded is_new_session = group_diff | time_diff else: @@ -66,16 +66,16 @@ def _add_session_column_pandas( @_add_session_column.specialize("polars") def _add_session_column_polars( - X, group_by, timestamp_col, session_gap, session_column_name + X, split_by, timestamp_col, session_gap, session_column_name ): # check if the time difference between events exceeds the session gap time_diff = X[timestamp_col].dt.epoch("ms").diff().fill_null(0) > session_gap * 1000 - if group_by: - # check if the "group_by" column changes + if split_by: + # check if the "split_by" column changes group_diff = X.select( - pl.any_horizontal(pl.col(group_by).diff().fill_null(0) != 0) + pl.any_horizontal(pl.col(split_by).diff().fill_null(0) != 0) ).to_series() - # a new session starts if either the "group_by" column changes or the time + # a new session starts if either the "split_by" column changes or the time # gap is exceeded 
is_new_session = group_diff | time_diff else: @@ -113,7 +113,8 @@ class SessionEncoder(TransformerMixin, BaseEstimator): A session is defined as a sequence of events where consecutive events are separated by at most ``session_gap`` seconds. Additionally, it is possible to provide a column - or list of columns that identifies the user (specified by the ``group_by`` column). + or list of columns that can be used to distinguish between sessions, such + as user identifiers (specified by the ``split_by`` column). When the time gap between consecutive events exceeds ``session_gap``, or when what identifies a user changes, a new session begins. All unrelated columns are passed through unchanged. @@ -124,7 +125,7 @@ class SessionEncoder(TransformerMixin, BaseEstimator): The name of the column that identifies the time of an event. This column is used to determine the start and end of a session. - group_by : optional[str, list[str]], default=None + split_by : optional[str, list[str]], default=None The name of the column, or list of columns, to group by. This parameter is used to group events into sessions by, for example, user. If not provided, sessions are detected based on the time gap between events, and all @@ -157,7 +158,7 @@ class SessionEncoder(TransformerMixin, BaseEstimator): >>> from skrub import SessionEncoder >>> from datetime import datetime, timedelta >>> encoder = SessionEncoder( - ... group_by='user_id', timestamp_col='timestamp', session_gap=30 * 60 + ... split_by='user_id', timestamp_col='timestamp', session_gap=30 * 60 ... ) >>> data = { ... 'user_id': ['alice', 'alice', 'alice', 'bob', 'bob'], @@ -201,7 +202,7 @@ class SessionEncoder(TransformerMixin, BaseEstimator): on different devices should have separate sessions. >>> encoder_multi = SessionEncoder( - ... group_by=['user_id', 'device_id'], + ... split_by=['user_id', 'device_id'], ... timestamp_col='timestamp', ... session_gap=30 * 60 ... 
) @@ -246,7 +247,7 @@ class SessionEncoder(TransformerMixin, BaseEstimator): timeseries or events that don't have a user dimension: >>> encoder_no_group = SessionEncoder( - ... group_by=None, + ... split_by=None, ... timestamp_col='timestamp', ... session_gap=30 * 60 ... ) @@ -296,7 +297,7 @@ class SessionEncoder(TransformerMixin, BaseEstimator): ... } >>> df = pd.DataFrame(data_multi) >>> encoder_user = SessionEncoder("timestamp", - ... group_by=["user_id"], suffix="user") + ... split_by=["user_id"], suffix="user") >>> encoder_user.fit_transform(df) user_id device_id timestamp action timestamp_user 0 1 mobile 2024-01-01 10:00:00 view 0 @@ -307,7 +308,7 @@ class SessionEncoder(TransformerMixin, BaseEstimator): 5 2 mobile 2024-01-01 10:15:00 view 1 >>> encoder_user_device = SessionEncoder("timestamp", - ... group_by=["user_id", "device_id"], + ... split_by=["user_id", "device_id"], ... suffix="user_device") >>> encoder_user_device.fit_transform(df) user_id device_id timestamp action timestamp_user_device @@ -321,10 +322,10 @@ class SessionEncoder(TransformerMixin, BaseEstimator): """ def __init__( - self, timestamp_col, group_by=None, session_gap=30 * 60, suffix="session_id" + self, timestamp_col, split_by=None, session_gap=30 * 60, suffix="session_id" ): self.timestamp_col = timestamp_col - self.group_by = group_by + self.split_by = split_by self.session_gap = session_gap self.suffix = suffix @@ -379,9 +380,13 @@ def fit_transform(self, X, y=None): self._session_id_name = f"{self.timestamp_col}_{self.suffix}" + # If the generated session id column name already exists in the input dataframe, + # we add a random suffix to avoid overwriting it if self._session_id_name in self.all_inputs_: self._session_id_name += f"_skrub_{random_string()}" + # if the input dataframe is empty, we can skip all the processing and + # return an empty dataframe with the session_id column added if sbd.is_empty_frame(X): X = sbd.with_columns( X, **{self._session_id_name: np.array([], 
dtype=np.float32)} @@ -405,7 +410,7 @@ def fit_transform(self, X, y=None): # sort the input dataframe by the "group_by" and "timestamp" columns sort_by = ( self._group_by_columns + [self.timestamp_col] - if self.group_by is not None + if self.split_by is not None else [self.timestamp_col] ) X_sorted = sbd.sort(X_selected, by=sort_by) @@ -463,26 +468,26 @@ def _check_input_dataframe(self): ) # check that the required columns are present in the input dataframe self._group_by_columns = [] - if self.group_by is not None: - if isinstance(self.group_by, str): - self._group_by_columns = [self.group_by] - elif isinstance(self.group_by, Iterable) and not isinstance( - self.group_by, str + if self.split_by is not None: + if isinstance(self.split_by, str): + self._group_by_columns = [self.split_by] + elif isinstance(self.split_by, Iterable) and not isinstance( + self.split_by, str ): - self._group_by_columns = list(self.group_by) + self._group_by_columns = list(self.split_by) else: - raise TypeError("group_by must be a string, a list of strings, or None") + raise TypeError("split_by must be a string, a list of strings, or None") for col in self._group_by_columns: if col not in self.all_inputs_: raise ValueError(f"Column '{col}' not found in input dataframe") def _factorize_columns(self, X): """ - convert group_by columns to numerical columns if they're not already, to + convert split_by columns to numerical columns if they're not already, to ensure that the diff operation works correctly """ - if not self.group_by: + if not self.split_by: return X, [] factorized_columns = { f"{col}_factorized_skrub_{random_string()}": _factorize_column(X, col) diff --git a/skrub/tests/test_session_encoder.py b/skrub/tests/test_session_encoder.py index 42095d3be..dcba9968e 100644 --- a/skrub/tests/test_session_encoder.py +++ b/skrub/tests/test_session_encoder.py @@ -61,7 +61,7 @@ def example_session_data_multi_by(df_module): A user is uniquely identified by the combination of ``user_id`` and 
``device_id``. The same ``user_id`` on two different devices produces - independent sessions, which lets us verify that ``group_by`` accepts a list of + independent sessions, which lets us verify that ``split_by`` accepts a list of column names. """ timestamps = [] @@ -103,19 +103,19 @@ def example_session_data_multi_by(df_module): @pytest.mark.parametrize( - "by_column,expected_sessions,group_key_to_sessions", + "by_column,expected_sessions,split_key_to_sessions", [ ("user_id", 6, {101: 3, 102: 2, 103: 1}), ("username", 6, {"alice": 3, "bob": 2, "charlie": 1}), ], ) def test_session_encoder_basic( - example_session_data, by_column, expected_sessions, group_key_to_sessions + example_session_data, by_column, expected_sessions, split_key_to_sessions ): """Test basic sessionization grouping by user_id or username.""" # Apply SessionEncoder grouping by the specified column se = SessionEncoder( - group_by=by_column, timestamp_col="timestamp", session_gap=30 * 60 + split_by=by_column, timestamp_col="timestamp", session_gap=30 * 60 ) result = se.fit_transform(example_session_data) @@ -135,7 +135,7 @@ def test_session_encoder_basic( counted_sessions[group_key] = set() counted_sessions[group_key].add(session_id) for group_key, sessions in counted_sessions.items(): - assert len(sessions) == group_key_to_sessions[group_key] + assert len(sessions) == split_key_to_sessions[group_key] # Checking that fit then transform still works result_fit = se.fit(example_session_data).transform(example_session_data) @@ -158,7 +158,7 @@ def test_session_encoder_different_users_different_sessions( """Test that different users/groups have different session IDs.""" # Apply SessionEncoder se = SessionEncoder( - group_by=by_column, timestamp_col="timestamp", session_gap=30 * 60 + split_by=by_column, timestamp_col="timestamp", session_gap=30 * 60 ) result = se.fit_transform(example_session_data) @@ -196,7 +196,7 @@ def test_session_encoder_multi_by_columns(example_session_data_multi_by): Total: 4 
sessions """ se = SessionEncoder( - group_by=["user_id", "device_id"], + split_by=["user_id", "device_id"], timestamp_col="timestamp", session_gap=30 * 60, ) @@ -251,7 +251,7 @@ def test_session_encoder_multiple_users(df_module): ) se = SessionEncoder( - group_by="user_id", timestamp_col="timestamp", session_gap=30 * 60 + split_by="user_id", timestamp_col="timestamp", session_gap=30 * 60 ) result = se.fit_transform(df) @@ -283,7 +283,7 @@ def test_session_encoder_time_gap_threshold(df_module): # With 20-minute gap: should create 2 sessions (split at 35-min gap) se_20 = SessionEncoder( - group_by="user_id", timestamp_col="timestamp", session_gap=20 * 60 + split_by="user_id", timestamp_col="timestamp", session_gap=20 * 60 ) result_20 = se_20.fit_transform(df) session_ids_20 = sbd.to_list(sbd.col(result_20, "timestamp_session_id")) @@ -291,7 +291,7 @@ def test_session_encoder_time_gap_threshold(df_module): # With 40-minute gap: should create 1 session (all gaps < 40 min) se_40 = SessionEncoder( - group_by="user_id", timestamp_col="timestamp", session_gap=40 * 60 + split_by="user_id", timestamp_col="timestamp", session_gap=40 * 60 ) result_40 = se_40.fit_transform(df) session_ids_40 = sbd.to_list(sbd.col(result_40, "timestamp_session_id")) @@ -301,7 +301,7 @@ def test_session_encoder_time_gap_threshold(df_module): def test_session_encoder_no_user_column(df_module): """Test sessionization without a user identifier column. - When ``group_by`` is None, all events are treated as from the same "user", and + When ``split_by`` is None, all events are treated as from the same "user", and sessions are separated only by time gaps. 
""" timestamps = [ @@ -319,7 +319,7 @@ def test_session_encoder_no_user_column(df_module): ) # Without 'group_by', sessions are separated only by time gaps - se = SessionEncoder(group_by=None, timestamp_col="timestamp", session_gap=30 * 60) + se = SessionEncoder(split_by=None, timestamp_col="timestamp", session_gap=30 * 60) result = se.fit_transform(df) session_ids = sbd.to_list(sbd.col(result, "timestamp_session_id")) @@ -342,7 +342,7 @@ def test_session_encoder_single_event(df_module): } ) - se = SessionEncoder(group_by="user_id", timestamp_col="timestamp", session_gap=30) + se = SessionEncoder(split_by="user_id", timestamp_col="timestamp", session_gap=30) result = se.fit_transform(df) session_ids = sbd.to_list(sbd.col(result, "timestamp_session_id")) @@ -360,7 +360,7 @@ def test_session_encoder_empty_dataframe(df_module): } ) - se = SessionEncoder(group_by="user_id", timestamp_col="timestamp", session_gap=30) + se = SessionEncoder(split_by="user_id", timestamp_col="timestamp", session_gap=30) result = se.fit_transform(df) assert sbd.shape(result)[0] == 0 @@ -392,7 +392,7 @@ def test_session_encoder_empty_dataframe(df_module): 23, # invalid type for 'group_by' "timestamp", TypeError, - "group_by must be a string, a list of strings, or None", + "split_by must be a string, a list of strings, or None", ), ], ) @@ -413,7 +413,7 @@ def test_session_encoder_missing_column_error( ) se = SessionEncoder( - group_by=group_by_param, + split_by=group_by_param, timestamp_col=timestamp_col_param, ) with pytest.raises(expected_error_type, match=expected_error_match): @@ -431,28 +431,28 @@ def test_session_encoder_invalid_parameters(df_module): # Test non-numeric session_gap se_non_numeric = SessionEncoder( - group_by="user_id", timestamp_col="timestamp", session_gap="thirty" + split_by="user_id", timestamp_col="timestamp", session_gap="thirty" ) with pytest.raises(TypeError, match="Expected a number"): se_non_numeric.fit_transform(df) # Test negative session_gap se_negative = 
SessionEncoder( - group_by="user_id", timestamp_col="timestamp", session_gap=-10 + split_by="user_id", timestamp_col="timestamp", session_gap=-10 ) with pytest.raises(ValueError, match="session_gap must be a positive number"): se_negative.fit_transform(df) # Test zero session_gap se_zero = SessionEncoder( - group_by="user_id", timestamp_col="timestamp", session_gap=0 + split_by="user_id", timestamp_col="timestamp", session_gap=0 ) with pytest.raises(ValueError, match="session_gap must be a positive number"): se_zero.fit_transform(df) # Test invalid suffix (None) se_invalid_suffix = SessionEncoder( - group_by="user_id", timestamp_col="timestamp", suffix=None + split_by="user_id", timestamp_col="timestamp", suffix=None ) with pytest.raises(ValueError, match="Expected a string as suffix"): se_invalid_suffix.fit_transform(df) @@ -471,7 +471,7 @@ def test_session_encoder_preserves_columns(df_module): } ) - se = SessionEncoder(group_by="user_id", timestamp_col="timestamp", session_gap=30) + se = SessionEncoder(split_by="user_id", timestamp_col="timestamp", session_gap=30) result = se.fit_transform(df) result_cols = sbd.column_names(result) @@ -493,7 +493,7 @@ def test_session_encoder_fit_and_transform(df_module): } ) - se = SessionEncoder(group_by="user_id", timestamp_col="timestamp", session_gap=30) + se = SessionEncoder(split_by="user_id", timestamp_col="timestamp", session_gap=30) # Test fit returns self se_fitted = se.fit(df) @@ -515,7 +515,7 @@ def test_get_feature_names(df_module): } ) - se = SessionEncoder(group_by="user_id", timestamp_col="timestamp", session_gap=30) + se = SessionEncoder(split_by="user_id", timestamp_col="timestamp", session_gap=30) se.fit(df) feature_names = se.get_feature_names_out() @@ -563,7 +563,9 @@ def test_check_is_new_session_no_by(df_module): ) session_id = sbd.to_list( sbd.col( - _add_session_column(df, [], "timestamp", 30 * 60, suffix="session_id"), + _add_session_column( + df, [], "timestamp", 30 * 60, 
session_column_name="timestamp_session_id" + ), "timestamp_session_id", ) ) @@ -618,7 +620,9 @@ def test_check_is_new_session_with_by(df_module): ], } ) - result = _add_session_column(df, ["user_id"], "timestamp", 30 * 60, "session_id") + result = _add_session_column( + df, ["user_id"], "timestamp", 30 * 60, "timestamp_session_id" + ) # _add_session_column now returns the full dataframe with session_id added assert "timestamp_session_id" in sbd.column_names(result) @@ -649,11 +653,11 @@ def test_proper_suffix(timestamp, suffix, df_module): if suffix is None: with pytest.raises(ValueError, match="Expected a string as suffix*"): SessionEncoder( - timestamp_col=timestamp, group_by="user_id", suffix=suffix + timestamp_col=timestamp, split_by="user_id", suffix=suffix ).fit_transform(df) else: result = SessionEncoder( - timestamp_col=timestamp, group_by="user_id", suffix=suffix + timestamp_col=timestamp, split_by="user_id", suffix=suffix ).fit_transform(df) # _add_session_column now returns the full dataframe with session_id added expected_name = f"{timestamp}_{suffix}" @@ -677,7 +681,7 @@ def test_session_encoder_preserves_input_order(df_module): df = df_module.make_dataframe({"timestamp": timestamps, "user_id": user_ids}) - se = SessionEncoder(group_by="user_id", timestamp_col="timestamp", session_gap=30) + se = SessionEncoder(split_by="user_id", timestamp_col="timestamp", session_gap=30) result = se.fit_transform(df) # The user_id column must still be in the original order @@ -691,10 +695,10 @@ def test_session_encoder_preserves_input_order(df_module): ( partial( _add_session_column, - group_by=[], + split_by=[], timestamp_col="timestamp", session_gap=30, - suffix="_session_id", + session_column_name="timestamp_session_id", ), partial(_factorize_column, column_name="user_id"), ), From 091c1227d60c97386410ba10d41229ddb27a91b9 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 28 May 2026 14:04:08 +0200 Subject: [PATCH 35/74] _ --- 
skrub/tests/test_session_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/tests/test_session_encoder.py b/skrub/tests/test_session_encoder.py index dcba9968e..5471cbfb1 100644 --- a/skrub/tests/test_session_encoder.py +++ b/skrub/tests/test_session_encoder.py @@ -588,7 +588,7 @@ def test_add_session_column_old_pandas(df_module): ) session_id = sbd.to_list( sbd.col( - _add_session_column(df, [], "timestamp", 30 * 60, suffix="session_id"), + _add_session_column(df, [], "timestamp", 30 * 60, "timestamp_session_id"), "timestamp_session_id", ) ) From 6de367ca5e802c358aa5547274514c6d4fbdb5ff Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Thu, 28 May 2026 14:14:45 +0200 Subject: [PATCH 36/74] fixing a bug on windows --- skrub/_session_encoder.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 64150e9b9..9ba9faabe 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -42,14 +42,19 @@ def _add_session_column_pandas( # pandas 3.0 changed the resolution of astype(int) for datetime columns from # nanoseconds to milliseconds, so we need to adjust the time difference calculation # accordingly + # + # astype(int64) is needed (rather than just int) because on windows this converts + # to int32 if parse(pd.__version__).major <= 2: # check if the time difference between events exceeds the session gap time_diff = ( - X[timestamp_col].astype(int).diff().fillna(0) // 10**6 > session_gap * 1000 + X[timestamp_col].astype("int64").diff().fillna(0) // 10**6 + > session_gap * 1000 ) else: time_diff = ( - X[timestamp_col].astype(int).diff().fillna(0) // 10**3 > session_gap * 1000 + X[timestamp_col].astype("int64").diff().fillna(0) // 10**3 + > session_gap * 1000 ) if split_by: # check if the "split_by" column changes From 528006b47de6cad4704863df43d546989ca5e734 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Fri, 29 May 2026 15:52:06 
+0200 Subject: [PATCH 37/74] adding new generator and example --- examples/data_ops/1170_session_encoder.py | 152 +++++++++++++++ skrub/_session_encoder.py | 21 +- skrub/datasets/__init__.py | 9 +- skrub/datasets/_generating.py | 228 ++++++++++++++++++++++ 4 files changed, 393 insertions(+), 17 deletions(-) create mode 100644 examples/data_ops/1170_session_encoder.py diff --git a/examples/data_ops/1170_session_encoder.py b/examples/data_ops/1170_session_encoder.py new file mode 100644 index 000000000..3f7e452f6 --- /dev/null +++ b/examples/data_ops/1170_session_encoder.py @@ -0,0 +1,152 @@ +""" +Use SessionEncoder in DataOps to predict purchases +================================================== +This example shows how to use |SessionEncoder| in a skrub DataOps workflow. +We will: + +1. Generate synthetic retail event data +2. Build a baseline classifier on raw event-level features +3. Add session-level and historical features +4. Train the same model again and compare ROC-AUC + +The data comes from |make_retail_events| and includes columns such as event type, +device type, viewed price, and timestamp. The target is binary: whether the +session eventually contains a purchase event. +""" + +# %% +import skrub +from skrub.datasets import make_retail_events + +# %% +events = make_retail_events(n_users=20, n_events=5000, random_state=0) +# %% +# Mark feature and target data with |skrub.X| and |skrub.y| so they can be used +# in a DataOps workflow. + +X, y = skrub.X(events.X), skrub.y(events.y) + +# %% +# As a sanity check, evaluate a |DummyClassifier| on the original event data +# (without session features). We expect chance-level performance +# (ROC-AUC of 0.5). +from sklearn.dummy import DummyClassifier + +dummy = DummyClassifier(strategy="most_frequent") +dummy_pred = X.skb.apply(dummy, y=y) +dummy_learner = dummy_pred.skb.make_learner() +# %% +# Because this is temporal data, we use a time-aware CV strategy. +# We reuse the same splitter for all evaluations. 
+from sklearn.model_selection import TimeSeriesSplit + +splitter = TimeSeriesSplit(n_splits=5) +dummy_results = skrub.cross_validate( + dummy_learner, environment=dummy_pred.skb.get_data(), cv=splitter, scoring="roc_auc" +) +print(f"ROC-AUC with DummyClassifier: {dummy_results['test_score'].mean():.3f}") + +# %% +# Try a real model with |tabular_pipeline|, first on raw event-level data. +from skrub import tabular_pipeline + +model = tabular_pipeline("classification") + +pred = X.skb.apply(model, y=y) +learner = pred.skb.make_learner() +results = skrub.cross_validate( + learner, environment=pred.skb.get_data(), cv=splitter, scoring="roc_auc" +) +print(f"ROC-AUC without session encoding: {results['test_score'].mean():.3f}") + +# %% +# This baseline is limited because it cannot directly use session-level behavior +# (for example, whether "add_to_cart" happened in the same session). +# +# Next, create sessions with |SessionEncoder|. We define boundaries from +# ``timestamp`` within each ``user_id``. A new session starts after more than +# 30 minutes of inactivity (``session_gap`` is in seconds). +# %% +from skrub import SessionEncoder + +se = SessionEncoder("timestamp", split_by="user_id", session_gap=30 * 60) +X_sessions = X.skb.apply(se) + +# %% +# ``timestamp_session_id`` identifies the session of each event. +# We use it to compute session-level aggregates and join them back to event-level rows. +# +# We will compute the following session-level features: +# - ``session_has_add_to_cart``: whether the session includes at least one "add_to_cart" +# event +# - ``session_n_events``: the total number of events in the session +# - ``session_mean_price``: the mean price viewed during the session +# - ``session_dominant_device``: the most frequently used device type in the session +# - ``event_rank_in_session``: the rank of the event within its session (0 for the +# first event, 1 for the second, etc.) 
+# - ``is_last_event_in_session``: whether the event is the last event in its session +# +# We also compute one user-level historical feature after sorting by timestamp: +# - ``time_since_last_event``: the time in seconds since the previous event for the +# same user (NaN for the first event of each user) + + +def most_frequent(series): + # mode() can return multiple values; use the first one + # for a deterministic tie-break. + return series.mode().iat[0] + + +def compute_session_features(df): + session_agg = df.groupby("timestamp_session_id").agg( + session_has_add_to_cart=("event_type", lambda x: "add_to_cart" in x.values), + session_n_events=("event_type", "count"), + session_mean_price=("price_viewed", "mean"), + session_dominant_device=("device_type", most_frequent), + ) + df = df.join(session_agg, on="timestamp_session_id") + grouped = df.groupby("timestamp_session_id") + df["event_rank_in_session"] = grouped.cumcount() + session_sizes = grouped["event_type"].transform("size") + df["is_last_event_in_session"] = df["event_rank_in_session"].eq(session_sizes - 1) + return df + + +def compute_historical_features(df): + # Preserve input row order after timestamp-based computations. + df["_row_order"] = df.index + df = df.sort_values("timestamp") + df["time_since_last_event"] = ( + df.groupby("user_id")["timestamp"].diff().dt.total_seconds() + ) + df = df.sort_values("_row_order").drop(columns="_row_order") + return df + + +X_enriched = X_sessions.skb.apply_func(compute_session_features) +X_enriched = X_enriched.skb.apply_func(compute_historical_features) +X_enriched +# %% +# Now we can train the same model on the enriched data with session-level features +# and see if the performance improves. 
+model = tabular_pipeline("classification") +pred_enriched = X_enriched.skb.apply(model, y=y) +learner_enriched = pred_enriched.skb.make_learner() +results_enriched = skrub.cross_validate( + learner_enriched, + environment=pred_enriched.skb.get_data(), + cv=splitter, + scoring="roc_auc", +) +print(f"ROC-AUC with session encoding: {results_enriched['test_score'].mean():.3f}") + +# %% +# The enriched model should outperform the baseline, showing the value of +# session-level context for conversion prediction. +# +# Note that the session-level aggregates above are computed over whole sessions, +# so an event's features can include information from later events in the same +# session; restrict them to past events if predictions are needed mid-session. +# +# This example focuses on SessionEncoder usage, so we intentionally keep modeling +# simple (no hyperparameter tuning and only a small set of engineered features). diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 9ba9faabe..54d451312 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -15,7 +15,6 @@ import numpy as np import pandas as pd -from packaging.version import parse from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.validation import check_is_fitted @@ -39,23 +38,13 @@ def _add_session_column(X, split_by, timestamp_col, session_gap, session_column_ def _add_session_column_pandas( X, split_by, timestamp_col, session_gap, session_column_name ): - # pandas 3.0 changed the resolution of astype(int) for datetime columns from - # nanoseconds to milliseconds, so we need to adjust the time difference calculation - # accordingly - # # astype(int64) is needed (rather than just int) because on windows this converts # to int32 - if parse(pd.__version__).major <= 2: - # check if the time difference between events exceeds the session gap - time_diff = ( - X[timestamp_col].astype("int64").diff().fillna(0) // 10**6 - > session_gap * 1000 - ) - else: - time_diff = ( - 
X[timestamp_col].astype("int64").diff().fillna(0) // 10**3 - > session_gap * 1000 - ) + # check if the time difference between events exceeds the session gap + # dividing by 10**6 because int64 is in nanoseconds, while session_gap is in seconds + time_diff = ( + X[timestamp_col].astype("int64").diff().fillna(0) // 10**6 > session_gap * 1000 + ) if split_by: # check if the "split_by" column changes group_diff = (X[split_by].diff().fillna(0) != 0).any(axis=1) diff --git a/skrub/datasets/__init__.py b/skrub/datasets/__init__.py index a00b9bb8f..e6130f07d 100644 --- a/skrub/datasets/__init__.py +++ b/skrub/datasets/__init__.py @@ -14,7 +14,13 @@ fetch_traffic_violations, fetch_videogame_sales, ) -from ._generating import make_deduplication_data, toy_cities, toy_orders, toy_products +from ._generating import ( + make_deduplication_data, + make_retail_events, + toy_cities, + toy_orders, + toy_products, +) from ._utils import get_data_dir __all__ = [ @@ -34,6 +40,7 @@ "fetch_videogame_sales", "get_data_dir", "make_deduplication_data", + "make_retail_events", "toy_orders", "toy_products", "toy_cities", diff --git a/skrub/datasets/_generating.py b/skrub/datasets/_generating.py index 2d4b0c2d0..8d9748a37 100644 --- a/skrub/datasets/_generating.py +++ b/skrub/datasets/_generating.py @@ -298,3 +298,231 @@ def toy_cities(seed=0, size=1000, nulls=0.1, n_metrics=4): df = pd.concat((df_cities, df_dates, df_metrics), axis=1) return df + + +def make_retail_events(n_users=200, n_events=5000, random_state=None): + """Generate a synthetic e-commerce clickstream dataset for classification. + + Each row represents one user interaction event on a retail platform. + The dataset is designed to showcase :class:`~skrub.SessionEncoder` (which + groups events into sessions using ``user_id`` and ``timestamp``), + :class:`~skrub.DatetimeEncoder` (which extracts hour-of-day, day-of-week, + etc. from ``timestamp``), one-hot encoding of categorical features, and + scaling of numerical features. 
+ + The binary target ``converted`` indicates whether a purchase occurred + during the session that contains this event. All events belonging to the + same session share the same ``converted`` value (a session either converts + or it does not). The probability of conversion is determined at the + session level by the most intent-rich event type in the session + (``add_to_cart`` > ``wishlist`` > ``search`` > ``page_view``), the + dominant device, and the mean price viewed — so the signal is learnable + directly from the observable features. + + Parameters + ---------- + n_users : int, default=200 + Number of distinct users in the dataset. + + n_events : int, default=5000 + Total number of events (rows) to generate. Events are distributed + across users with a multinomial draw and then split into sessions, + so the output has exactly ``n_events`` rows. + + random_state : int or RandomState instance, optional + Controls the random number generation for reproducibility. + + Returns + ------- + bunch : :class:`~sklearn.utils.Bunch` + A dictionary-like object with the following attributes: + + - ``X`` : :class:`~pandas.DataFrame` with columns: + + - ``user_id`` : str — user identifier, suitable for + ``SessionEncoder(split_by="user_id", ...)``. + - ``timestamp`` : :class:`~pandas.Timestamp` — event time, suitable + for ``SessionEncoder(timestamp_col="timestamp", ...)`` and + :class:`~skrub.DatetimeEncoder`. + - ``device_type`` : str — one of ``"mobile"``, ``"desktop"``, + ``"tablet"``; encode with :class:`~sklearn.preprocessing.OneHotEncoder`. + - ``page_category`` : str — one of ``"electronics"``, ``"fashion"``, + ``"home"``, ``"sports"``, ``"books"``; encode with + :class:`~sklearn.preprocessing.OneHotEncoder`. + - ``event_type`` : str — one of ``"page_view"``, ``"search"``, + ``"add_to_cart"``, ``"wishlist"``; encode with + :class:`~sklearn.preprocessing.OneHotEncoder`. + - ``time_on_page`` : float — seconds spent on the page (exponential + distribution, mean ≈ 120 s). 
+ - ``price_viewed`` : float — price of the item viewed (log-normal). + + - ``y`` : :class:`~pandas.Series` of bool, name ``"converted"`` — the + classification target. + + Examples + -------- + >>> from skrub.datasets import make_retail_events + >>> bunch = make_retail_events(n_users=20, n_events=100, random_state=0) + >>> bunch.X.shape[1] # 7 feature columns; rows ≈ n_events + 7 + >>> bunch.X.columns.tolist() + ['user_id', 'timestamp', 'device_type', 'page_category', 'event_type', 'time_on_page', 'price_viewed'] + >>> bunch.y.name + 'converted' + >>> bunch.y.dtype + dtype('bool') + """ # noqa: E501 + rng = check_random_state(random_state) + + # --- users ----------------------------------------------------------- + user_ids = [f"user_{i:04d}" for i in range(n_users)] + + # Distribute events across users with a power-law (Pareto) weight so that + # a small number of users generate the bulk of the activity. + activity_weights = rng.pareto(2.0, size=n_users) + 1.0 + activity_weights /= activity_weights.sum() + events_per_user = rng.multinomial(n_events, activity_weights) + + # --- timestamps with realistic session structure --------------------- + # Events form *sessions*: bursts of activity where consecutive events are + # only ~90 s apart, separated by long idle gaps (at least 2 h). This + # structure is what SessionEncoder is designed to detect. + # + # For each user: + # 1. Split their event budget into sessions of Poisson(3)+1 events. + # 2. Space session starts by Exponential gaps >> session_gap, spread + # across a 90-day window. + # 3. Within each session, place events with Exponential(90 s) gaps. 
+ base_time = pd.Timestamp("2024-01-01") + total_window_s = 90 * 24 * 3600 # 90 days + within_session_mean_s = 90.0 # ~1.5 min between events inside a session + min_between_session_s = 2 * 3600 # 2 h minimum gap — well above session_gap + + all_user_ids: list = [] + all_timestamps: list = [] + all_session_keys: list = [] # unique key per (user, session) pair + + for uid, n_user_events in zip(user_ids, events_per_user): + if n_user_events == 0: + continue + + # Split into sessions; each session has at least 1 event. + session_sizes = [] + remaining = int(n_user_events) + while remaining > 0: + size = min(int(rng.poisson(3)) + 1, remaining) + session_sizes.append(size) + remaining -= size + + n_sessions = len(session_sizes) + + # Session start times: inter-session gaps drawn from Exponential so + # that they are spread over the 90-day window but always exceed the + # minimum between-session gap. + mean_gap_s = max(total_window_s / n_sessions, min_between_session_s) + inter_gaps = rng.exponential(scale=mean_gap_s, size=n_sessions) + # Random offset for the very first session start. 
+ inter_gaps[0] += rng.uniform(0, min_between_session_s) + session_starts_s = np.cumsum(inter_gaps) + + for sess_idx, (start_s, sess_size) in enumerate( + zip(session_starts_s, session_sizes) + ): + # Events are placed at start_s, start_s+gap1, start_s+gap1+gap2 … + within_gaps = np.concatenate( + [[0.0], rng.exponential(within_session_mean_s, size=sess_size - 1)] + ) + session_key = f"{uid}_{sess_idx}" + for offset_s in start_s + np.cumsum(within_gaps): + all_user_ids.append(uid) + all_timestamps.append(base_time + pd.Timedelta(seconds=float(offset_s))) + all_session_keys.append(session_key) + + n_actual = len(all_user_ids) + + # --- categorical features -------------------------------------------- + device_type = rng.choice( + ["mobile", "desktop", "tablet"], + size=n_actual, + p=[0.55, 0.35, 0.10], + ) + page_category = rng.choice( + ["electronics", "fashion", "home", "sports", "books"], + size=n_actual, + ) + event_type = rng.choice( + ["page_view", "search", "add_to_cart", "wishlist"], + size=n_actual, + p=[0.60, 0.20, 0.15, 0.05], + ) + + # --- numerical features ---------------------------------------------- + # time_on_page: seconds spent on page (heavy-tailed) + time_on_page = rng.exponential(scale=120.0, size=n_actual).round(1) + # price_viewed: item price in USD (log-normal, median ≈ e^3.5 ≈ 33) + price_viewed = np.exp(rng.normal(loc=3.5, scale=1.2, size=n_actual)).round(2) + + # --- assemble & sort ------------------------------------------------- + X = pd.DataFrame( + { + "user_id": all_user_ids, + "timestamp": all_timestamps, + "_session_key": all_session_keys, + "device_type": device_type, + "page_category": page_category, + "event_type": event_type, + "time_on_page": time_on_page, + "price_viewed": price_viewed, + } + ) + # Sorting by timestamp + X = X.sort_values(["timestamp"]).reset_index(drop=True) + + # --- target: converted (bool), assigned per session ------------------ + # A session either converts or it does not — all events in a session + # 
share the same label. This is the realistic framing: a checkout + # either happens in a session or it doesn't. + # + # The conversion probability is a logistic function of session-level + # summaries of observable features, so the model can learn it: + # + # best_event : the most purchase-intent event type in the session + # (add_to_cart >> wishlist >> search >> page_view) + # device : dominant device (desktop > tablet > mobile) + # price : mean price viewed (expensive items convert less) + event_intent = X["event_type"].map( + {"add_to_cart": 2.0, "wishlist": 0.8, "search": 0.0, "page_view": -0.5} + ) + device_score_col = X["device_type"].map( + {"desktop": 0.5, "tablet": 0.1, "mobile": -0.3} + ) + price_score_col = -0.2 * np.log1p(X["price_viewed"]) + + tmp = X[["_session_key"]].assign( + event_intent=event_intent, + device_score=device_score_col, + price_score=price_score_col, + ) + session_logits = ( + tmp.groupby("_session_key") + .agg( + event_intent=("event_intent", "max"), + device_score=("device_score", "mean"), + price_score=("price_score", "mean"), + ) + .sum(axis=1) + ) + + # Add one noise draw per session (not per event) so the label is + # consistent within a session. 
+ unique_keys = session_logits.index.tolist() + noise = dict(zip(unique_keys, rng.normal(0.0, 0.5, size=len(unique_keys)))) + session_prob = { + k: 1.0 / (1.0 + np.exp(-(session_logits[k] + noise[k]))) for k in unique_keys + } + session_converted = {k: bool(rng.binomial(1, session_prob[k])) for k in unique_keys} + + y = X["_session_key"].map(session_converted).rename("converted").astype(bool) + X = X.drop(columns=["_session_key"]) + + return Bunch(X=X, y=y) From d26222f411c96443e4eab0ce2459d49a58125ef0 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Fri, 29 May 2026 16:41:44 +0200 Subject: [PATCH 38/74] adding comments --- skrub/datasets/_generating.py | 43 ++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/skrub/datasets/_generating.py b/skrub/datasets/_generating.py index 8d9748a37..2fc479d2f 100644 --- a/skrub/datasets/_generating.py +++ b/skrub/datasets/_generating.py @@ -345,15 +345,13 @@ def make_retail_events(n_users=200, n_events=5000, random_state=None): for ``SessionEncoder(timestamp_col="timestamp", ...)`` and :class:`~skrub.DatetimeEncoder`. - ``device_type`` : str — one of ``"mobile"``, ``"desktop"``, - ``"tablet"``; encode with :class:`~sklearn.preprocessing.OneHotEncoder`. + ``"tablet"``. - ``page_category`` : str — one of ``"electronics"``, ``"fashion"``, - ``"home"``, ``"sports"``, ``"books"``; encode with - :class:`~sklearn.preprocessing.OneHotEncoder`. + ``"home"``, ``"sports"``, ``"books"``. - ``event_type`` : str — one of ``"page_view"``, ``"search"``, - ``"add_to_cart"``, ``"wishlist"``; encode with - :class:`~sklearn.preprocessing.OneHotEncoder`. + ``"add_to_cart"``, ``"wishlist"``. - ``time_on_page`` : float — seconds spent on the page (exponential - distribution, mean ≈ 120 s). + distribution, mean ~ 120 s). - ``price_viewed`` : float — price of the item viewed (log-normal). 
- ``y`` : :class:`~pandas.Series` of bool, name ``"converted"`` — the @@ -363,7 +361,7 @@ def make_retail_events(n_users=200, n_events=5000, random_state=None): -------- >>> from skrub.datasets import make_retail_events >>> bunch = make_retail_events(n_users=20, n_events=100, random_state=0) - >>> bunch.X.shape[1] # 7 feature columns; rows ≈ n_events + >>> bunch.X.shape[1] # 7 feature columns; rows ~ n_events 7 >>> bunch.X.columns.tolist() ['user_id', 'timestamp', 'device_type', 'page_category', 'event_type', 'time_on_page', 'price_viewed'] @@ -381,12 +379,12 @@ def make_retail_events(n_users=200, n_events=5000, random_state=None): # a small number of users generate the bulk of the activity. activity_weights = rng.pareto(2.0, size=n_users) + 1.0 activity_weights /= activity_weights.sum() + # The multinomial draw gives us the number of events per user, summing to n_events. events_per_user = rng.multinomial(n_events, activity_weights) # --- timestamps with realistic session structure --------------------- # Events form *sessions*: bursts of activity where consecutive events are - # only ~90 s apart, separated by long idle gaps (at least 2 h). This - # structure is what SessionEncoder is designed to detect. + # only ~90 s apart, separated by long idle gaps (at least 2 h). # # For each user: # 1. Split their event budget into sessions of Poisson(3)+1 events. @@ -407,6 +405,8 @@ def make_retail_events(n_users=200, n_events=5000, random_state=None): continue # Split into sessions; each session has at least 1 event. + # The size of each session (number of events) is drawn from a Poisson + # distribution with mean 3, plus 1 to ensure at least one event per session. 
session_sizes = [] remaining = int(n_user_events) while remaining > 0: @@ -414,6 +414,7 @@ def make_retail_events(n_users=200, n_events=5000, random_state=None): session_sizes.append(size) remaining -= size + # Number of sessions per user n_sessions = len(session_sizes) # Session start times: inter-session gaps drawn from Exponential so @@ -457,9 +458,9 @@ def make_retail_events(n_users=200, n_events=5000, random_state=None): ) # --- numerical features ---------------------------------------------- - # time_on_page: seconds spent on page (heavy-tailed) + # time_on_page: seconds spent on page time_on_page = rng.exponential(scale=120.0, size=n_actual).round(1) - # price_viewed: item price in USD (log-normal, median ≈ e^3.5 ≈ 33) + # price_viewed: item price in USD (log-normal, median ~ e^3.5 ~ 33) price_viewed = np.exp(rng.normal(loc=3.5, scale=1.2, size=n_actual)).round(2) # --- assemble & sort ------------------------------------------------- @@ -480,8 +481,7 @@ def make_retail_events(n_users=200, n_events=5000, random_state=None): # --- target: converted (bool), assigned per session ------------------ # A session either converts or it does not — all events in a session - # share the same label. This is the realistic framing: a checkout - # either happens in a session or it doesn't. + # share the same label. # # The conversion probability is a logistic function of session-level # summaries of observable features, so the model can learn it: @@ -490,6 +490,8 @@ def make_retail_events(n_users=200, n_events=5000, random_state=None): # (add_to_cart >> wishlist >> search >> page_view) # device : dominant device (desktop > tablet > mobile) # price : mean price viewed (expensive items convert less) + + # First, add the weights corresponding to each feature to the event-level dataframe. 
event_intent = X["event_type"].map( {"add_to_cart": 2.0, "wishlist": 0.8, "search": 0.0, "page_view": -0.5} ) @@ -498,11 +500,17 @@ def make_retail_events(n_users=200, n_events=5000, random_state=None): ) price_score_col = -0.2 * np.log1p(X["price_viewed"]) + # Then, select only the session key and the relevant features tmp = X[["_session_key"]].assign( event_intent=event_intent, device_score=device_score_col, price_score=price_score_col, ) + # Now aggregate at the session level. + # - For event_intent, we take the max to get the most purchase-intent event in + # the session. + # - For device_score and price_score, we take the mean across events in the session + # Then, to get a single session-level logit, we sum the three aggregated features. session_logits = ( tmp.groupby("_session_key") .agg( @@ -517,12 +525,21 @@ def make_retail_events(n_users=200, n_events=5000, random_state=None): # consistent within a session. unique_keys = session_logits.index.tolist() noise = dict(zip(unique_keys, rng.normal(0.0, 0.5, size=len(unique_keys)))) + # add the noise to the session logits, then apply the logistic function to get a + # conversion probability per session, then draw the binary label from a Bernoulli. + # For every session key k, session_prob[k] is the conversion probability of + # that session, and session_converted[k] is the binary label indicating whether + # that session converted or not. session_prob = { k: 1.0 / (1.0 + np.exp(-(session_logits[k] + noise[k]))) for k in unique_keys } session_converted = {k: bool(rng.binomial(1, session_prob[k])) for k in unique_keys} + # Finally, get the label for each event by mapping its session key to the + # session_converted dict. 
This is our y y = X["_session_key"].map(session_converted).rename("converted").astype(bool) + # Drop the session key (the SessionEncoder should be able to recover it from + # user_id and timestamp) X = X.drop(columns=["_session_key"]) return Bunch(X=X, y=y) From 26e4436bfb4403ada82c88205e4962673cfabfd7 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Fri, 29 May 2026 16:59:54 +0200 Subject: [PATCH 39/74] fixing inconsistency --- skrub/_session_encoder.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 54d451312..3b5a90c62 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -41,10 +41,8 @@ def _add_session_column_pandas( # astype(int64) is needed (rather than just int) because on windows this converts # to int32 # check if the time difference between events exceeds the session gap - # dividing by 10**6 because int64 is in nanoseconds, while session_gap is in seconds - time_diff = ( - X[timestamp_col].astype("int64").diff().fillna(0) // 10**6 > session_gap * 1000 - ) + # dividing by 10**6 because int64 is in us, while session_gap is in seconds + time_diff = X[timestamp_col].astype("int64").diff().fillna(0) // 10**6 > session_gap if split_by: # check if the "split_by" column changes group_diff = (X[split_by].diff().fillna(0) != 0).any(axis=1) @@ -63,7 +61,12 @@ def _add_session_column_polars( X, split_by, timestamp_col, session_gap, session_column_name ): # check if the time difference between events exceeds the session gap - time_diff = X[timestamp_col].dt.epoch("ms").diff().fill_null(0) > session_gap * 1000 + # setting the time unit to "us" (microseconds) for consistency with pandas + # int64 representation of timestamps, and dividing by 10**6 because session_gap is + # in seconds + time_diff = ( + X[timestamp_col].dt.epoch("us").diff().fill_null(0) // 10**6 > session_gap + ) if split_by: # check if the "split_by" column changes group_diff = 
X.select( From fecc532a022ee7c1ef2fbb2da15e61efa2ec5acd Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Fri, 29 May 2026 17:23:28 +0200 Subject: [PATCH 40/74] fixing possible bug --- skrub/_session_encoder.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 3b5a90c62..032648e95 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -41,8 +41,11 @@ def _add_session_column_pandas( # astype(int64) is needed (rather than just int) because on windows this converts # to int32 # check if the time difference between events exceeds the session gap - # dividing by 10**6 because int64 is in us, while session_gap is in seconds - time_diff = X[timestamp_col].astype("int64").diff().fillna(0) // 10**6 > session_gap + # dividing by 10**9 because int64 is in ms, while session_gap is in seconds + time_diff = ( + X[timestamp_col].dt.as_unit("ns").astype("int64").diff().fillna(0) // 10**9 + > session_gap + ) if split_by: # check if the "split_by" column changes group_diff = (X[split_by].diff().fillna(0) != 0).any(axis=1) @@ -61,11 +64,10 @@ def _add_session_column_polars( X, split_by, timestamp_col, session_gap, session_column_name ): # check if the time difference between events exceeds the session gap - # setting the time unit to "us" (microseconds) for consistency with pandas - # int64 representation of timestamps, and dividing by 10**6 because session_gap is - # in seconds + # setting the time unit to "ns" (nanoseconds), and dividing by 10**9 because + # session_gap is in seconds time_diff = ( - X[timestamp_col].dt.epoch("us").diff().fill_null(0) // 10**6 > session_gap + X[timestamp_col].dt.epoch("ns").diff().fill_null(0) // 10**9 > session_gap ) if split_by: # check if the "split_by" column changes From bf58d53cf4af68634dc382f51f10a00333acfa64 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Fri, 29 May 2026 17:24:47 +0200 Subject: [PATCH 41/74] comments --- 
skrub/_session_encoder.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 032648e95..c272a5b11 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -42,6 +42,8 @@ def _add_session_column_pandas( # to int32 # check if the time difference between events exceeds the session gap # dividing by 10**9 because int64 is in ms, while session_gap is in seconds + # as_unit("ns") is because the timestamp might be in a different unit (e.g. ms), + # and we want to make sure it's in ns for the diff to work correctly time_diff = ( X[timestamp_col].dt.as_unit("ns").astype("int64").diff().fillna(0) // 10**9 > session_gap @@ -66,6 +68,8 @@ def _add_session_column_polars( # check if the time difference between events exceeds the session gap # setting the time unit to "ns" (nanoseconds), and dividing by 10**9 because # session_gap is in seconds + # using ns for consistency with pandas, which uses ns for timestamps, and + # to avoid issues with timestamps in different units time_diff = ( X[timestamp_col].dt.epoch("ns").diff().fill_null(0) // 10**9 > session_gap ) From 2aa11ccef6cd5cd636db0618fa0beada31319075 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Fri, 29 May 2026 17:35:20 +0200 Subject: [PATCH 42/74] grr --- skrub/_session_encoder.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index c272a5b11..52c20787a 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -44,10 +44,7 @@ def _add_session_column_pandas( # dividing by 10**9 because int64 is in ms, while session_gap is in seconds # as_unit("ns") is because the timestamp might be in a different unit (e.g. 
ms), # and we want to make sure it's in ns for the diff to work correctly - time_diff = ( - X[timestamp_col].dt.as_unit("ns").astype("int64").diff().fillna(0) // 10**9 - > session_gap - ) + time_diff = X[timestamp_col].diff().dt.total_seconds().fillna(0) > session_gap if split_by: # check if the "split_by" column changes group_diff = (X[split_by].diff().fillna(0) != 0).any(axis=1) @@ -70,9 +67,7 @@ def _add_session_column_polars( # session_gap is in seconds # using ns for consistency with pandas, which uses ns for timestamps, and # to avoid issues with timestamps in different units - time_diff = ( - X[timestamp_col].dt.epoch("ns").diff().fill_null(0) // 10**9 > session_gap - ) + time_diff = X[timestamp_col].diff().dt.total_seconds().fill_null(0) > session_gap if split_by: # check if the "split_by" column changes group_diff = X.select( From 3c7933389853b9eca4226138f923224cde1991a4 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com> Date: Mon, 1 Jun 2026 09:28:44 +0200 Subject: [PATCH 43/74] Apply suggestions from code review Co-authored-by: Gael Varoquaux --- examples/data_ops/1170_session_encoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/data_ops/1170_session_encoder.py b/examples/data_ops/1170_session_encoder.py index 3f7e452f6..f1f851691 100644 --- a/examples/data_ops/1170_session_encoder.py +++ b/examples/data_ops/1170_session_encoder.py @@ -28,8 +28,8 @@ # %% # As a sanity check, evaluate a |DummyClassifier| on the original event data -# (without session features). We expect chance-level performance -# (ROC-AUC of 0.5). +# (without session features). As it's a DummyClassifier, we expect +# chance-level performance (ROC-AUC of 0.5). 
from sklearn.dummy import DummyClassifier dummy = DummyClassifier(strategy="most_frequent") From 8b30ebd94d38b6d8675b3feb993692fb0c2f2677 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com> Date: Mon, 1 Jun 2026 09:33:51 +0200 Subject: [PATCH 44/74] Update skrub/_session_encoder.py Co-authored-by: Gael Varoquaux --- skrub/_session_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 52c20787a..1ea3282a9 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -3,7 +3,7 @@ - a "timestamp" column, which identifies the time of an event - a "by" column or list of columns, which identifies a user - a "session_gap" value, which identifies the maximum allowed gap in seconds -between events in a session + between events in a session It returns a dataframe with the same number of rows as the input, but with the column "session_id": a unique identifier for each session, which is a combination From e561d9b216888d05d49066db60c64f1f4aea54f1 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 1 Jun 2026 14:03:37 +0200 Subject: [PATCH 45/74] improvements and changes from review --- doc/api_reference.py | 3 + examples/FIXME/0110_session_encoder.py | 63 +++++++++++++++ examples/data_ops/1170_session_encoder.py | 97 ++++++++++++++++++----- skrub/datasets/_generating.py | 7 +- 4 files changed, 143 insertions(+), 27 deletions(-) create mode 100644 examples/FIXME/0110_session_encoder.py diff --git a/doc/api_reference.py b/doc/api_reference.py index 7a91d24ae..74e5283dd 100644 --- a/doc/api_reference.py +++ b/doc/api_reference.py @@ -339,6 +339,9 @@ "datasets.get_data_dir", "datasets.make_deduplication_data", "datasets.toy_orders", + "datasets.toy_products", + "datasets.toy_cities", + "datasets.make_retail_events", ], } ], diff --git a/examples/FIXME/0110_session_encoder.py b/examples/FIXME/0110_session_encoder.py new file mode 100644 index 
000000000..8153f7ea1 --- /dev/null +++ b/examples/FIXME/0110_session_encoder.py @@ -0,0 +1,63 @@ +# %% +from sklearn.dummy import DummyClassifier +from sklearn.model_selection import TimeSeriesSplit, cross_val_score + +from skrub import SessionEncoder, tabular_pipeline +from skrub.datasets import make_retail_events + +# %% +bunch = make_retail_events(n_users=20, n_events=1000, random_state=0) +# %% +X, y = bunch.X, bunch.y +# %% +X.head() +# %% +se = SessionEncoder("timestamp", split_by="user_id", session_gap=30 * 60) +# %% +X_sessions = se.fit_transform(X) +# %% +model = tabular_pipeline("classification") + +# %% +splitter = TimeSeriesSplit(n_splits=5) +scores = cross_val_score(model, X, y, cv=splitter, scoring="roc_auc") +print("ROC-AUC without session encoding:", scores.mean()) +# ROC-AUC without session encoding: 0.4758557724112403 +# %% +scores = cross_val_score(model, X_sessions, y, cv=splitter, scoring="roc_auc") +print("ROC-AUC with session encoding:", scores.mean()) +# ROC-AUC with session encoding: 0.48788976843161597 + +# %% +scores = cross_val_score( + DummyClassifier(strategy="most_frequent"), + X_sessions, + y, + cv=splitter, + scoring="roc_auc", +) +print("ROC-AUC with DummyClassifier:", scores.mean()) +# ROC-AUC with DummyClassifier: 0.5 +# %% + +from skrub import SessionEncoder + +# Step 1: add session_id +se = SessionEncoder("timestamp", split_by="user_id", session_gap=30 * 60) +X_sessions = se.fit_transform(X) + +# Step 2: compute & join session aggregates +session_agg = X_sessions.groupby("timestamp_session_id").agg( + session_has_add_to_cart=("event_type", lambda x: "add_to_cart" in x.values), + session_n_events=("event_type", "count"), + session_mean_price=("price_viewed", "mean"), + session_dominant_device=("device_type", lambda x: x.mode()[0]), +) +X_enriched = X_sessions.join(session_agg, on="timestamp_session_id") + +# Step 3: fit tabular_pipeline on enriched X +model = tabular_pipeline("classification") +# %% +scores = 
cross_val_score(model, X_enriched, y, cv=splitter, scoring="roc_auc") +print("ROC-AUC with session encoding:", scores.mean()) +# %% diff --git a/examples/data_ops/1170_session_encoder.py b/examples/data_ops/1170_session_encoder.py index f1f851691..69b9bdadd 100644 --- a/examples/data_ops/1170_session_encoder.py +++ b/examples/data_ops/1170_session_encoder.py @@ -1,17 +1,43 @@ """ -Use SessionEncoder in DataOps to predict purchases -================================================== -This example shows how to use |SessionEncoder| in a skrub DataOps workflow. + +.. |SessionEncoder| replace:: :class:`~skrub.SessionEncoder` +.. |make_retail_events| replace:: :func:`~skrub.datasets.make_retail_events` +.. |tabular_pipeline| replace:: :func:`~skrub.tabular_pipeline` +.. |skrub.X| replace:: :func:`~skrub.X` +.. |skrub.y| replace:: :func:`~skrub.y` +.. |TableVectorizer| replace:: :class:`~skrub.TableVectorizer` +.. |DummyClassifier| replace:: :class:`~sklearn.dummy.DummyClassifier` +.. |TimeSeriesSplit| replace:: :class:`~sklearn.model_selection.TimeSeriesSplit` +.. |cross_validate| replace:: :func:`~skrub.cross_validate` +.. |apply_func| replace:: :func:`~skrub.DataOp.skb.apply_func` + +Sessions in time-based data: Using SessionEncoder in rich DataOps pipeline +========================================================================== + +This example shows how to use |SessionEncoder| in a skrub DataOps workflow to +create session-level features (sessionization) for conversion prediction, that is +predicting whether a user session will eventually lead to a purchase. + +**What is sessionization?** + +Sessionization is the process of grouping a sequence of events (like user +interactions) into meaningful sessions. A session typically starts fresh or +after a period of inactivity. For example, in an online retail context, you +might define a new session whenever more than 30 minutes pass with no activity +from a user. 
This allows you to extract session-level features (like the total +number of events in a session or the dominant device type used) which often have +greater predictive power than raw individual events. + We will: -1. Generate synthetic retail event data -2. Build a baseline classifier on raw event-level features -3. Add session-level and historical features +1. Use |make_retail_events| to generate synthetic retail event data +2. Build a baseline classifier on raw event-level features with the |tabular_pipeline| +3. Add session-level and historical features with |SessionEncoder| 4. Train the same model again and compare ROC-AUC -The data comes from |make_retail_events| and includes columns such as event type, -device type, viewed price, and timestamp. The target is binary: whether the -session eventually contains a purchase event. +The data includes columns such as event type, device type, viewed price, and +timestamp. The target is binary: whether the session eventually contains a +purchase event or not. """ # %% @@ -19,16 +45,16 @@ from skrub.datasets import make_retail_events # %% -events = make_retail_events(n_users=20, n_events=5000, random_state=0) -# %% -# Mark feature and target data with |skrub.X| and |skrub.y| so they can be used +# We begin by generating the data with |make_retail_events| and marking feature +# and target data with |skrub.X| and |skrub.y| so they can be used # in a DataOps workflow. +events = make_retail_events(n_users=20, n_events=5000, random_state=0) X, y = skrub.X(events.X), skrub.y(events.y) - +X # %% # As a sanity check, evaluate a |DummyClassifier| on the original event data -# (without session features). As it's a DummyClassifier, we expect +# (without session features). Since it's a DummyClassifier, we expect # chance-level performance (ROC-AUC of 0.5). 
from sklearn.dummy import DummyClassifier @@ -36,8 +62,8 @@ dummy_pred = X.skb.apply(dummy, y=y) dummy_learner = dummy_pred.skb.make_learner() # %% -# Because this is temporal data, we use a time-aware CV strategy. -# We reuse the same splitter for all evaluations. +# Because this is temporal data, we use a time-aware CV strategy with +# |TimeSeriesSplit| to avoid leakage. We reuse the same splitter for all evaluations. from sklearn.model_selection import TimeSeriesSplit splitter = TimeSeriesSplit(n_splits=5) @@ -47,7 +73,14 @@ print(f"ROC-AUC with DummyClassifier: {dummy_results['test_score'].mean():.3f}") # %% -# Try a real model with |tabular_pipeline|, first on raw event-level data. +# First attempt: training a model without using session-level features +# -------------------------------------------------------------------- +# We first use the |tabular_pipeline| on raw event-level data, without any session +# encoding or aggregation. This serves as a baseline to compare against the enriched +# model later. +# Remember that the |tabular_pipeline| will automatically add a |TableVectorizer| +# to perform feature engineering, so the model can still learn from the raw event +# features. However, it won't be able to directly capture session-level patterns. from skrub import tabular_pipeline model = tabular_pipeline("classification") @@ -60,23 +93,34 @@ print(f"ROC-AUC without session encoding: {results['test_score'].mean():.3f}") # %% +# The model is not performing much better than the DummyClassifier, which suggests +# that raw event-level features are not sufficient for good conversion prediction. # This baseline is limited because it cannot directly use session-level behavior # (for example, whether "add_to_cart" happened in the same session). # -# Next, create sessions with |SessionEncoder|. We define boundaries from -# ``timestamp`` within each ``user_id``. A new session starts after more than -# 30 minutes of inactivity (``session_gap`` is in seconds). 
+# %% +# A better approach: session encoding and aggregation +# ------------------------------------------------------ +# Next, we use the |SessionEncoder| to create session-level features that we can +# aggregate over. We define a session boundary as "a user has been inactive for +# more than 30 minutes". The |SessionEncoder| will create a new column +# ``timestamp_session_id`` that assigns a unique session ID to each session detected. +# The parameter ``session_gap=30 * 60`` specifies the inactivity threshold in +# seconds (30 minutes). + # %% from skrub import SessionEncoder se = SessionEncoder("timestamp", split_by="user_id", session_gap=30 * 60) X_sessions = X.skb.apply(se) +X_sessions # %% # ``timestamp_session_id`` identifies the session of each event. # We use it to compute session-level aggregates and join them back to event-level rows. # # We will compute the following session-level features: +# # - ``session_has_add_to_cart``: whether the session includes at least one "add_to_cart" # event # - ``session_n_events``: the total number of events in the session @@ -87,6 +131,7 @@ # - ``is_last_event_in_session``: whether the event is the last event in its session # # We also compute one user-level historical feature after sorting by timestamp: +# # - ``time_since_last_event``: the time in seconds since the previous event for the # same user (NaN for the first event of each user) @@ -112,6 +157,11 @@ def compute_session_features(df): return df +# %% +# We want to compute the historical feature ``time_since_last_event``, but we need +# to sort by timestamp first to ensure that the "previous event" is correctly +# defined. After computing the feature, we restore the original row order to avoid +# any issues with downstream processing that might expect the original order. def compute_historical_features(df): # Preserve input row order after timestamp-based computations. 
df["_row_order"] = df.index @@ -123,6 +173,9 @@ def compute_historical_features(df): return df +# %% +# We use |apply_func| to apply these feature engineering functions to the data +# with session IDs. X_enriched = X_sessions.skb.apply_func(compute_session_features) X_enriched = X_enriched.skb.apply_func(compute_historical_features) X_enriched @@ -146,7 +199,7 @@ def compute_historical_features(df): # # In DataOps, these aggregations are evaluated with temporal ordering in mind, # which helps prevent leakage: features for an event are computed only from data -# available up to that event timestamp. +# available up to that event timestamp (provided that the correct splitter is used). # -# This example focuses on SessionEncoder usage, so we intentionally keep modeling +# This example focuses on |SessionEncoder| usage, so we intentionally keep modeling # simple (no hyperparameter tuning and only a small set of engineered features). diff --git a/skrub/datasets/_generating.py b/skrub/datasets/_generating.py index 2fc479d2f..defbe3657 100644 --- a/skrub/datasets/_generating.py +++ b/skrub/datasets/_generating.py @@ -339,11 +339,8 @@ def make_retail_events(n_users=200, n_events=5000, random_state=None): - ``X`` : :class:`~pandas.DataFrame` with columns: - - ``user_id`` : str — user identifier, suitable for - ``SessionEncoder(split_by="user_id", ...)``. - - ``timestamp`` : :class:`~pandas.Timestamp` — event time, suitable - for ``SessionEncoder(timestamp_col="timestamp", ...)`` and - :class:`~skrub.DatetimeEncoder`. + - ``user_id`` : str — user identifier. + - ``timestamp`` : :class:`~pandas.Timestamp` — event time. - ``device_type`` : str — one of ``"mobile"``, ``"desktop"``, ``"tablet"``. 
- ``page_category`` : str — one of ``"electronics"``, ``"fashion"``, From d9012dc09bf44bce077d1c52ef8af3e1ed5a0e8f Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 1 Jun 2026 15:26:07 +0200 Subject: [PATCH 46/74] more improvements --- examples/data_ops/1170_session_encoder.py | 77 +++++++++++++---------- 1 file changed, 44 insertions(+), 33 deletions(-) diff --git a/examples/data_ops/1170_session_encoder.py b/examples/data_ops/1170_session_encoder.py index 69b9bdadd..c506c023e 100644 --- a/examples/data_ops/1170_session_encoder.py +++ b/examples/data_ops/1170_session_encoder.py @@ -18,15 +18,17 @@ create session-level features (sessionization) for conversion prediction, that is predicting whether a user session will eventually lead to a purchase. -**What is sessionization?** +.. note:: -Sessionization is the process of grouping a sequence of events (like user -interactions) into meaningful sessions. A session typically starts fresh or -after a period of inactivity. For example, in an online retail context, you -might define a new session whenever more than 30 minutes pass with no activity -from a user. This allows you to extract session-level features (like the total -number of events in a session or the dominant device type used) which often have -greater predictive power than raw individual events. + **What is sessionization?** + + Sessionization is the process of grouping a sequence of events (like user + interactions) into meaningful sessions. A session typically starts fresh or + after a period of inactivity. For example, in an online retail context, you + might define a new session whenever more than 30 minutes pass with no activity + from a user. This allows you to extract session-level features (like the total + number of events in a session or the dominant device type used) which often have + greater predictive power than raw individual events. 
We will: @@ -41,32 +43,35 @@ """ # %% -import skrub -from skrub.datasets import make_retail_events +# %% +# Since this is temporal data, we use a time-aware CV strategy with +# |TimeSeriesSplit| to avoid leakage. We reuse the same splitter for all evaluations. +from sklearn.model_selection import TimeSeriesSplit + +splitter = TimeSeriesSplit(n_splits=5) # %% # We begin by generating the data with |make_retail_events| and marking feature # and target data with |skrub.X| and |skrub.y| so they can be used # in a DataOps workflow. +import skrub +from skrub.datasets import make_retail_events + events = make_retail_events(n_users=20, n_events=5000, random_state=0) X, y = skrub.X(events.X), skrub.y(events.y) X # %% -# As a sanity check, evaluate a |DummyClassifier| on the original event data -# (without session features). Since it's a DummyClassifier, we expect +# Sanity check: evaluate a DummyClassifier on raw event data +# --------------------------------------------------------------- +# We begin by evaluating a |DummyClassifier| on the original event data +# (without session features). Since it's a |DummyClassifier|, we expect # chance-level performance (ROC-AUC of 0.5). from sklearn.dummy import DummyClassifier dummy = DummyClassifier(strategy="most_frequent") dummy_pred = X.skb.apply(dummy, y=y) dummy_learner = dummy_pred.skb.make_learner() -# %% -# Because this is temporal data, we use a time-aware CV strategy with -# |TimeSeriesSplit| to avoid leakage. We reuse the same splitter for all evaluations. -from sklearn.model_selection import TimeSeriesSplit - -splitter = TimeSeriesSplit(n_splits=5) dummy_results = skrub.cross_validate( dummy_learner, environment=dummy_pred.skb.get_data(), cv=splitter, scoring="roc_auc" ) @@ -97,7 +102,7 @@ # that raw event-level features are not sufficient for good conversion prediction. # This baseline is limited because it cannot directly use session-level behavior # (for example, whether "add_to_cart" happened in the same session). 
-# + # %% # A better approach: session encoding and aggregation # ------------------------------------------------------ @@ -119,21 +124,24 @@ # ``timestamp_session_id`` identifies the session of each event. # We use it to compute session-level aggregates and join them back to event-level rows. # -# We will compute the following session-level features: +# .. admonition:: Session-level feature engineering +# :collapsible: closed +# +# We will compute the following session-level features: # -# - ``session_has_add_to_cart``: whether the session includes at least one "add_to_cart" -# event -# - ``session_n_events``: the total number of events in the session -# - ``session_mean_price``: the mean price viewed during the session -# - ``session_dominant_device``: the most frequently used device type in the session -# - ``event_rank_in_session``: the rank of the event within its session (0 for the -# first event, 1 for the second, etc.) -# - ``is_last_event_in_session``: whether the event is the last event in its session +# - ``session_has_add_to_cart``: whether the session includes at least one +# "add_to_cart" event +# - ``session_n_events``: the total number of events in the session +# - ``session_mean_price``: the mean price viewed during the session +# - ``session_dominant_device``: the most frequently used device type in the session +# - ``event_rank_in_session``: the rank of the event within its session (0 for the +# first event, 1 for the second, etc.) 
+# - ``is_last_event_in_session``: whether the event is the last event in its session # -# We also compute one user-level historical feature after sorting by timestamp: +# We also compute one user-level historical feature after sorting by timestamp: # -# - ``time_since_last_event``: the time in seconds since the previous event for the -# same user (NaN for the first event of each user) +# - ``time_since_last_event``: the time in seconds since the previous event for the +# same user (NaN for the first event of each user) def most_frequent(series): @@ -194,9 +202,12 @@ def compute_historical_features(df): print(f"ROC-AUC with session encoding: {results_enriched['test_score'].mean():.3f}") # %% -# The enriched model should outperform the baseline, showing the value of +# The enriched model clearly outperforms the baseline, showing the value of # session-level context for conversion prediction. -# + +# %% +# Discussion +# ----------- # In DataOps, these aggregations are evaluated with temporal ordering in mind, # which helps prevent leakage: features for an event are computed only from data # available up to that event timestamp (provided that the correct splitter is used). From d73c45b378d382aa297119edaf7e98164067b90a Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 1 Jun 2026 16:00:35 +0200 Subject: [PATCH 47/74] adding plain example --- examples/0110_session_encoder.py | 172 ++++++++++++++++++++++ examples/FIXME/0110_session_encoder.py | 63 -------- examples/data_ops/1170_session_encoder.py | 30 ---- 3 files changed, 172 insertions(+), 93 deletions(-) create mode 100644 examples/0110_session_encoder.py delete mode 100644 examples/FIXME/0110_session_encoder.py diff --git a/examples/0110_session_encoder.py b/examples/0110_session_encoder.py new file mode 100644 index 000000000..65f51b035 --- /dev/null +++ b/examples/0110_session_encoder.py @@ -0,0 +1,172 @@ +""" + +.. |SessionEncoder| replace:: :class:`~skrub.SessionEncoder` +.. 
|make_retail_events| replace:: :func:`~skrub.datasets.make_retail_events` +.. |tabular_pipeline| replace:: :func:`~skrub.tabular_pipeline` +.. |skrub.X| replace:: :func:`~skrub.X` +.. |skrub.y| replace:: :func:`~skrub.y` +.. |TableVectorizer| replace:: :class:`~skrub.TableVectorizer` +.. |DummyClassifier| replace:: :class:`~sklearn.dummy.DummyClassifier` +.. |TimeSeriesSplit| replace:: :class:`~sklearn.model_selection.TimeSeriesSplit` +.. |cross_validate| replace:: :func:`~skrub.cross_validate` +.. |apply_func| replace:: :func:`~skrub.DataOp.skb.apply_func` + +Sessions in time-based data: Predicting conversion with the |SessionEncoder| +========================================================================== + +This example shows how to use |SessionEncoder| in a scikit-learn pipeline to +create session-level features (sessionization) for conversion prediction, that is +predicting whether a user session will eventually lead to a purchase. + +.. note:: + + **What is sessionization?** + + Sessionization is the process of grouping a sequence of events (like user + interactions) into meaningful sessions. A session typically starts fresh or + after a period of inactivity. For example, in an online retail context, you + might define a new session whenever more than 30 minutes pass with no activity + from a user. This allows you to extract session-level features (like the total + number of events in a session or the dominant device type used) which often have + greater predictive power than raw individual events. + +We will: + +1. Use |make_retail_events| to generate synthetic retail event data +2. Build a baseline classifier on raw event-level features with the |tabular_pipeline| +3. Add session-level and historical features with |SessionEncoder| +4. Train the same model again and compare ROC-AUC + +The data includes columns such as event type, device type, viewed price, and +timestamp. 
The target is binary: whether the session eventually contains a +purchase event or not. + + +.. note:: + +A version of this example that uses the skrub DataOps workflow instead of a +scikit-learn pipeline is available in :ref:`examples/data_ops/1170_session_encoder`. +""" + +# %% +# Since this is temporal data, we use a time-aware CV strategy with +# |TimeSeriesSplit| to avoid leakage. We reuse the same splitter for all evaluations. +from sklearn.model_selection import TimeSeriesSplit + +splitter = TimeSeriesSplit(n_splits=5) +# %% +# We begin by generating the data with |make_retail_events| and marking feature +# and target data with |skrub.X| and |skrub.y| so they can be used +# in a DataOps workflow. + +from skrub.datasets import make_retail_events + +events = make_retail_events(n_users=20, n_events=5000, random_state=0) +X, y = events.X, events.y +X +# %% +# Sanity check: evaluate a DummyClassifier on raw event data +# --------------------------------------------------------------- +# We begin by evaluating a |DummyClassifier| on the original event data +# (without session features). Since it's a |DummyClassifier|, we expect +# chance-level performance (ROC-AUC of 0.5). +from sklearn.dummy import DummyClassifier +from sklearn.model_selection import cross_val_score + +dummy = DummyClassifier(strategy="most_frequent") + +scores = cross_val_score(dummy, X, y, cv=splitter, scoring="roc_auc") +print(f"ROC-AUC with DummyClassifier: {scores.mean():.3f}") + +# %% +# First attempt: training a model without using session-level features +# -------------------------------------------------------------------- +# We first use the |tabular_pipeline| on raw event-level data, without any session +# encoding or aggregation. This serves as a baseline to compare against the enriched +# model later. +# Remember that the |tabular_pipeline| will automatically add a |TableVectorizer| +# to perform feature engineering, so the model can still learn from the raw event +# features. 
However, it won't be able to directly capture session-level patterns. +from skrub import tabular_pipeline + +model = tabular_pipeline("classification") + +scores = cross_val_score(model, X, y, cv=splitter, scoring="roc_auc") +print(f"ROC-AUC without session encoding: {scores.mean():.3f}") +# %% +# The model is not performing much better than the DummyClassifier, which suggests +# that raw event-level features are not sufficient for good conversion prediction. +# This baseline is limited because it cannot directly use session-level behavior +# (for example, whether "add_to_cart" happened in the same session). + +# %% +# A better approach: session encoding and aggregation +# ------------------------------------------------------ +# Next, we use the |SessionEncoder| to create session-level features that we can +# aggregate over. We define a session boundary as "a user has been inactive for +# more than 30 minutes". The |SessionEncoder| will create a new column +# ``timestamp_session_id`` that assigns a unique session ID to each session detected. +# The parameter ``session_gap=30 * 60`` specifies the inactivity threshold in +# seconds (30 minutes). +# +# Note that session-based features involve aggregations, which must be performed +# only on the training data within each fold to avoid leakage. In a scikit-learn +# pipeline, we can achieve this by using |SessionEncoder| followed by a custom +# transformer that computes session aggregates, and ensuring that the pipeline is +# properly fitted within each fold of cross-validation. +# %% +from skrub import SessionEncoder, tabular_pipeline + +se = SessionEncoder("timestamp", split_by="user_id", session_gap=30 * 60) +# Here we fit the SessionEncoder on the entire dataset for demonstration purposes +X_sessions = se.fit_transform(X) +X_sessions.head() + +# %% +# To avoid data leakage and maintain a clean pipeline, we can create a custom +# transformer that computes session-level aggregates within a scikit-learn pipeline. 
+# This transformer will be fitted and applied separately within each fold of +# cross-validation, ensuring that session features are computed only on the training +# data of each fold. + +from sklearn.base import BaseEstimator, TransformerMixin + + +class SessionAggregator(BaseEstimator, TransformerMixin): + def fit(self, X, y=None): + return self + + def transform(self, X): + # Compute session-level aggregates + session_agg = X.groupby("timestamp_session_id").agg( + session_has_add_to_cart=("event_type", lambda x: "add_to_cart" in x.values), + session_n_events=("event_type", "count"), + session_mean_price=("price_viewed", "mean"), + session_dominant_device=("device_type", lambda x: x.mode()[0]), + ) + # Join back to the original data + return X.join(session_agg, on="timestamp_session_id") + + +# %% +# Then, we create a pipeline that includes the |SessionEncoder|, our custom +# ``SessionAggregator``, and the |tabular_pipeline| for classification. This +# pipeline will be used in cross-validation to evaluate the model +# with session features. +from sklearn.pipeline import make_pipeline + +model = make_pipeline(se, SessionAggregator(), tabular_pipeline("classification")) +scores = cross_val_score(model, X, y, cv=splitter, scoring="roc_auc") +print("ROC-AUC with session encoding:", scores.mean()) + +# %% +# As expected, the model with session encoding performs much better than the baseline +# without session features, demonstrating the value of sessionization for conversion +# prediction. +# +# The fact that we are working with aggregation means that it was necessary to +# create a custom transformer to compute session-level features. This situation +# can be avoided by using the skrub DataOps workflow, which allows for more +# flexible data transformations without needing to fit everything within a +# scikit-learn pipeline. For an example of how to do this with DataOps, see +# :ref:`examples/data_ops/1170_session_encoder`. 
diff --git a/examples/FIXME/0110_session_encoder.py b/examples/FIXME/0110_session_encoder.py deleted file mode 100644 index 8153f7ea1..000000000 --- a/examples/FIXME/0110_session_encoder.py +++ /dev/null @@ -1,63 +0,0 @@ -# %% -from sklearn.dummy import DummyClassifier -from sklearn.model_selection import TimeSeriesSplit, cross_val_score - -from skrub import SessionEncoder, tabular_pipeline -from skrub.datasets import make_retail_events - -# %% -bunch = make_retail_events(n_users=20, n_events=1000, random_state=0) -# %% -X, y = bunch.X, bunch.y -# %% -X.head() -# %% -se = SessionEncoder("timestamp", split_by="user_id", session_gap=30 * 60) -# %% -X_sessions = se.fit_transform(X) -# %% -model = tabular_pipeline("classification") - -# %% -splitter = TimeSeriesSplit(n_splits=5) -scores = cross_val_score(model, X, y, cv=splitter, scoring="roc_auc") -print("ROC-AUC without session encoding:", scores.mean()) -# ROC-AUC without session encoding: 0.4758557724112403 -# %% -scores = cross_val_score(model, X_sessions, y, cv=splitter, scoring="roc_auc") -print("ROC-AUC with session encoding:", scores.mean()) -# ROC-AUC with session encoding: 0.48788976843161597 - -# %% -scores = cross_val_score( - DummyClassifier(strategy="most_frequent"), - X_sessions, - y, - cv=splitter, - scoring="roc_auc", -) -print("ROC-AUC with DummyClassifier:", scores.mean()) -# ROC-AUC with DummyClassifier: 0.5 -# %% - -from skrub import SessionEncoder - -# Step 1: add session_id -se = SessionEncoder("timestamp", split_by="user_id", session_gap=30 * 60) -X_sessions = se.fit_transform(X) - -# Step 2: compute & join session aggregates -session_agg = X_sessions.groupby("timestamp_session_id").agg( - session_has_add_to_cart=("event_type", lambda x: "add_to_cart" in x.values), - session_n_events=("event_type", "count"), - session_mean_price=("price_viewed", "mean"), - session_dominant_device=("device_type", lambda x: x.mode()[0]), -) -X_enriched = X_sessions.join(session_agg, on="timestamp_session_id") - 
-# Step 3: fit tabular_pipeline on enriched X -model = tabular_pipeline("classification") -# %% -scores = cross_val_score(model, X_enriched, y, cv=splitter, scoring="roc_auc") -print("ROC-AUC with session encoding:", scores.mean()) -# %% diff --git a/examples/data_ops/1170_session_encoder.py b/examples/data_ops/1170_session_encoder.py index c506c023e..71d356746 100644 --- a/examples/data_ops/1170_session_encoder.py +++ b/examples/data_ops/1170_session_encoder.py @@ -42,7 +42,6 @@ purchase event or not. """ -# %% # %% # Since this is temporal data, we use a time-aware CV strategy with # |TimeSeriesSplit| to avoid leakage. We reuse the same splitter for all evaluations. @@ -134,14 +133,6 @@ # - ``session_n_events``: the total number of events in the session # - ``session_mean_price``: the mean price viewed during the session # - ``session_dominant_device``: the most frequently used device type in the session -# - ``event_rank_in_session``: the rank of the event within its session (0 for the -# first event, 1 for the second, etc.) -# - ``is_last_event_in_session``: whether the event is the last event in its session -# -# We also compute one user-level historical feature after sorting by timestamp: -# -# - ``time_since_last_event``: the time in seconds since the previous event for the -# same user (NaN for the first event of each user) def most_frequent(series): @@ -158,26 +149,6 @@ def compute_session_features(df): session_dominant_device=("device_type", most_frequent), ) df = df.join(session_agg, on="timestamp_session_id") - grouped = df.groupby("timestamp_session_id") - df["event_rank_in_session"] = grouped.cumcount() - session_sizes = grouped["event_type"].transform("size") - df["is_last_event_in_session"] = df["event_rank_in_session"].eq(session_sizes - 1) - return df - - -# %% -# We want to compute the historical feature ``time_since_last_event``, but we need -# to sort by timestamp first to ensure that the "previous event" is correctly -# defined. 
After computing the feature, we restore the original row order to avoid -# any issues with downstream processing that might expect the original order. -def compute_historical_features(df): - # Preserve input row order after timestamp-based computations. - df["_row_order"] = df.index - df = df.sort_values("timestamp") - df["time_since_last_event"] = ( - df.groupby("user_id")["timestamp"].diff().dt.total_seconds() - ) - df = df.sort_values("_row_order").drop(columns="_row_order") return df @@ -185,7 +156,6 @@ def compute_historical_features(df): # We use |apply_func| to apply these feature engineering functions to the data # with session IDs. X_enriched = X_sessions.skb.apply_func(compute_session_features) -X_enriched = X_enriched.skb.apply_func(compute_historical_features) X_enriched # %% # Now we can train the same model on the enriched data with session-level features From b77a59ed6796108df9cad23b64da80d5b1fb8696 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 1 Jun 2026 16:04:56 +0200 Subject: [PATCH 48/74] rewording --- skrub/_session_encoder.py | 51 +++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 1ea3282a9..b52b75f0e 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -47,10 +47,10 @@ def _add_session_column_pandas( time_diff = X[timestamp_col].diff().dt.total_seconds().fillna(0) > session_gap if split_by: # check if the "split_by" column changes - group_diff = (X[split_by].diff().fillna(0) != 0).any(axis=1) + has_split_change = (X[split_by].diff().fillna(0) != 0).any(axis=1) # a new session starts if either the "split_by" column changes or the time # gap is exceeded - is_new_session = group_diff | time_diff + is_new_session = has_split_change | time_diff else: is_new_session = time_diff # Compute cumulative sum of is_new_session to create session IDs @@ -70,12 +70,12 @@ def _add_session_column_polars( time_diff = 
X[timestamp_col].diff().dt.total_seconds().fill_null(0) > session_gap if split_by: # check if the "split_by" column changes - group_diff = X.select( + has_split_change = X.select( pl.any_horizontal(pl.col(split_by).diff().fill_null(0) != 0) ).to_series() # a new session starts if either the "split_by" column changes or the time # gap is exceeded - is_new_session = group_diff | time_diff + is_new_session = has_split_change | time_diff else: is_new_session = time_diff # Add session_id by computing cumulative sum of is_new_session @@ -124,10 +124,13 @@ class SessionEncoder(TransformerMixin, BaseEstimator): is used to determine the start and end of a session. split_by : optional[str, list[str]], default=None - The name of the column, or list of columns, to group by. This parameter - is used to group events into sessions by, for example, user. If not - provided, sessions are detected based on the time gap between events, and all - events are considered to belong to the same user (or group). + The name of the column, or list of columns, to use to define sessions. + A session boundary is created when the value in any of these columns + changes, or when the time gap between events exceeds ``session_gap``. + This is typically a user identifier column, but it can also be used to define + sessions by other groupings (e.g. user and device type). + If not provided, sessions are detected based on the time gap between events, + and all events are considered to belong to the same user (or group). session_gap : int, default=1800 The maximum gap (in seconds) between events in a session. If the gap @@ -235,7 +238,7 @@ class SessionEncoder(TransformerMixin, BaseEstimator): - User 1 on "mobile" has session 1 (different device, so separate session). - User 2 on "mobile" has session 2 (different user). 
- Note that sessions are defined by sorting over the grouping columns and then + Note that sessions are defined by sorting over the ``split_by`` columns and then by the timestamp: this is why, while the "desktop" session of User 1 starts after their "mobile" session, it has session id ``0`` since in alphabetical ordering "desktop" is first. @@ -244,12 +247,12 @@ class SessionEncoder(TransformerMixin, BaseEstimator): sessions are separated only by time gaps. This is useful for analyzing a single timeseries or events that don't have a user dimension: - >>> encoder_no_group = SessionEncoder( + >>> encoder_no_split = SessionEncoder( ... split_by=None, ... timestamp_col='timestamp', ... session_gap=30 * 60 ... ) - >>> data_no_group = { + >>> data_no_split = { ... 'timestamp': [ ... pd.Timestamp('2024-01-01 10:00:00'), ... pd.Timestamp('2024-01-01 10:10:00'), # 10 min gap @@ -259,9 +262,9 @@ class SessionEncoder(TransformerMixin, BaseEstimator): ... ], ... 'event_type': ['start', 'action', 'action', 'restart', 'action'] ... 
} - >>> df_no_group = pd.DataFrame(data_no_group) - >>> result_no_group = encoder_no_group.fit_transform(df_no_group) - >>> result_no_group + >>> df_no_split = pd.DataFrame(data_no_split) + >>> result_no_split = encoder_no_split.fit_transform(df_no_split) + >>> result_no_split timestamp event_type timestamp_session_id 0 2024-01-01 10:00:00 start 0 1 2024-01-01 10:10:00 action 0 @@ -399,15 +402,15 @@ def fit_transform(self, X, y=None): if cols_to_remove := [ _ for _ in self.all_inputs_ - if _ not in self._group_by_columns + [self.timestamp_col] + if _ not in self._split_by_columns + [self.timestamp_col] ]: X_selected = sbd.drop_columns(X, s.cols(*cols_to_remove).expand(X)) else: X_selected = X - # sort the input dataframe by the "group_by" and "timestamp" columns + # sort the input dataframe by the "split_by" and "timestamp" columns sort_by = ( - self._group_by_columns + [self.timestamp_col] + self._split_by_columns + [self.timestamp_col] if self.split_by is not None else [self.timestamp_col] ) @@ -422,9 +425,9 @@ def fit_transform(self, X, y=None): # Reordering rows back to the original order X_result = sbd.sort(X_with_session_id, by=row_order_col) - # drop the factorized "group_by" columns if the original "group_by" + # drop the factorized "split_by" columns if the original "split_by" # columns were not numeric, and the column used to reorder - to_drop = [col for col in factorized_by if col not in self._group_by_columns] + to_drop = [col for col in factorized_by if col not in self._split_by_columns] to_drop += [row_order_col] X_result = sbd.drop_columns(X_result, to_drop) @@ -465,17 +468,17 @@ def _check_input_dataframe(self): f"Column '{self.timestamp_col}' not found in input dataframe" ) # check that the required columns are present in the input dataframe - self._group_by_columns = [] + self._split_by_columns = [] if self.split_by is not None: if isinstance(self.split_by, str): - self._group_by_columns = [self.split_by] + self._split_by_columns = [self.split_by] 
elif isinstance(self.split_by, Iterable) and not isinstance( self.split_by, str ): - self._group_by_columns = list(self.split_by) + self._split_by_columns = list(self.split_by) else: raise TypeError("split_by must be a string, a list of strings, or None") - for col in self._group_by_columns: + for col in self._split_by_columns: if col not in self.all_inputs_: raise ValueError(f"Column '{col}' not found in input dataframe") @@ -491,7 +494,7 @@ def _factorize_columns(self, X): f"{col}_factorized_skrub_{random_string()}": _factorize_column(X, col) if not sbd.is_numeric(X[col]) else X[col] - for col in self._group_by_columns + for col in self._split_by_columns } X_factorized = sbd.with_columns(X, **factorized_columns) From 8ee2bee1063dc15432bc846546485602f47b7493 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 1 Jun 2026 16:20:36 +0200 Subject: [PATCH 49/74] cleanup docstring --- skrub/_session_encoder.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index b52b75f0e..0711fa72f 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -159,7 +159,7 @@ class SessionEncoder(TransformerMixin, BaseEstimator): >>> from skrub import SessionEncoder >>> from datetime import datetime, timedelta >>> encoder = SessionEncoder( - ... split_by='user_id', timestamp_col='timestamp', session_gap=30 * 60 + ... split_by='user_id', timestamp_col='timestamp' ... ) >>> data = { ... 'user_id': ['alice', 'alice', 'alice', 'bob', 'bob'], @@ -205,7 +205,6 @@ class SessionEncoder(TransformerMixin, BaseEstimator): >>> encoder_multi = SessionEncoder( ... split_by=['user_id', 'device_id'], ... timestamp_col='timestamp', - ... session_gap=30 * 60 ... ) >>> data_multi = { ... 'user_id': [1, 1, 1, 1, 2, 2], @@ -250,7 +249,6 @@ class SessionEncoder(TransformerMixin, BaseEstimator): >>> encoder_no_split = SessionEncoder( ... split_by=None, ... timestamp_col='timestamp', - ... 
session_gap=30 * 60 ... ) >>> data_no_split = { ... 'timestamp': [ @@ -278,6 +276,25 @@ class SessionEncoder(TransformerMixin, BaseEstimator): - The event at 11:00 starts a new session 1 (45 min gap > 30 min). - The event at 11:10 continues session 1 (10 min gap < 30 min). + It is possible to change the duration of the session gap by setting the + ``session_gap`` parameter. For example, we can set it to 5 minutes (300 seconds) + instead of the default 30 minutes, and this will change the session assignments + accordingly: + + >>> encoder_new_gap = SessionEncoder( + ... split_by=None, + ... timestamp_col='timestamp', + ... session_gap=300 + ... ) + >>> result_new_gap = encoder_new_gap.fit_transform(df_no_split) + >>> result_new_gap + timestamp event_type timestamp_session_id + 0 2024-01-01 10:00:00 start 0 + 1 2024-01-01 10:10:00 action 1 + 2 2024-01-01 10:15:00 action 1 + 3 2024-01-01 11:00:00 restart 2 + 4 2024-01-01 11:10:00 action 3 + It is also possible to change the suffix that is added at the end of the session ID column via the "suffix" parameter. This is useful, for example, if you want to add sessions based on different groupings or intervals: From 981134e23300b22f1c34f050f8a44dd32db54eb8 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com> Date: Tue, 2 Jun 2026 11:52:45 +0200 Subject: [PATCH 50/74] Update examples/data_ops/1170_session_encoder.py Co-authored-by: Gael Varoquaux --- examples/data_ops/1170_session_encoder.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/data_ops/1170_session_encoder.py b/examples/data_ops/1170_session_encoder.py index 71d356746..95fc73dfa 100644 --- a/examples/data_ops/1170_session_encoder.py +++ b/examples/data_ops/1170_session_encoder.py @@ -18,9 +18,7 @@ create session-level features (sessionization) for conversion prediction, that is predicting whether a user session will eventually lead to a purchase. -.. note:: - - **What is sessionization?** +.. 
topic:: What is sessionization? Sessionization is the process of grouping a sequence of events (like user interactions) into meaningful sessions. A session typically starts fresh or From 6b03b14c01c203ccc877240a049cc3a7642fde78 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 2 Jun 2026 14:54:47 +0200 Subject: [PATCH 51/74] fixing timezone --- skrub/datasets/_generating.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skrub/datasets/_generating.py b/skrub/datasets/_generating.py index defbe3657..61211042c 100644 --- a/skrub/datasets/_generating.py +++ b/skrub/datasets/_generating.py @@ -5,9 +5,9 @@ from __future__ import annotations +import datetime import numbers import string -from datetime import datetime import numpy as np import pandas as pd @@ -388,7 +388,7 @@ def make_retail_events(n_users=200, n_events=5000, random_state=None): # 2. Space session starts by Exponential gaps >> session_gap, spread # across a 90-day window. # 3. Within each session, place events with Exponential(90 s) gaps. - base_time = pd.Timestamp("2024-01-01") + base_time = datetime.datetime(2024, 1, 1, 0, 0, tzinfo=datetime.timezone.utc) total_window_s = 90 * 24 * 3600 # 90 days within_session_mean_s = 90.0 # ~1.5 min between events inside a session min_between_session_s = 2 * 3600 # 2 h minimum gap — well above session_gap From 89c79b70907bb922d7123ed2e6f5d93cc04548f8 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 2 Jun 2026 14:55:04 +0200 Subject: [PATCH 52/74] doc cleanup --- examples/0110_session_encoder.py | 16 ++-------------- .../{data_ops => FIXME}/1170_session_encoder.py | 0 2 files changed, 2 insertions(+), 14 deletions(-) rename examples/{data_ops => FIXME}/1170_session_encoder.py (100%) diff --git a/examples/0110_session_encoder.py b/examples/0110_session_encoder.py index 65f51b035..ceaf942d4 100644 --- a/examples/0110_session_encoder.py +++ b/examples/0110_session_encoder.py @@ -3,13 +3,9 @@ .. 
|SessionEncoder| replace:: :class:`~skrub.SessionEncoder` .. |make_retail_events| replace:: :func:`~skrub.datasets.make_retail_events` .. |tabular_pipeline| replace:: :func:`~skrub.tabular_pipeline` -.. |skrub.X| replace:: :func:`~skrub.X` -.. |skrub.y| replace:: :func:`~skrub.y` .. |TableVectorizer| replace:: :class:`~skrub.TableVectorizer` .. |DummyClassifier| replace:: :class:`~sklearn.dummy.DummyClassifier` .. |TimeSeriesSplit| replace:: :class:`~sklearn.model_selection.TimeSeriesSplit` -.. |cross_validate| replace:: :func:`~skrub.cross_validate` -.. |apply_func| replace:: :func:`~skrub.DataOp.skb.apply_func` Sessions in time-based data: Predicting conversion with the |SessionEncoder| ========================================================================== @@ -18,9 +14,7 @@ create session-level features (sessionization) for conversion prediction, that is predicting whether a user session will eventually lead to a purchase. -.. note:: - - **What is sessionization?** +.. topic:: What is sessionization? Sessionization is the process of grouping a sequence of events (like user interactions) into meaningful sessions. A session typically starts fresh or @@ -41,11 +35,6 @@ timestamp. The target is binary: whether the session eventually contains a purchase event or not. - -.. note:: - -A version of this example that uses the skrub DataOps workflow instead of a -scikit-learn pipeline is available in :ref:`examples/data_ops/1170_session_encoder`. """ # %% @@ -168,5 +157,4 @@ def transform(self, X): # create a custom transformer to compute session-level features. This situation # can be avoided by using the skrub DataOps workflow, which allows for more # flexible data transformations without needing to fit everything within a -# scikit-learn pipeline. For an example of how to do this with DataOps, see -# :ref:`examples/data_ops/1170_session_encoder`. +# scikit-learn pipeline. 
diff --git a/examples/data_ops/1170_session_encoder.py b/examples/FIXME/1170_session_encoder.py similarity index 100% rename from examples/data_ops/1170_session_encoder.py rename to examples/FIXME/1170_session_encoder.py From 1f5fe6f8026fd6160cec6e7554d7b8a4e72508f8 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 2 Jun 2026 15:45:35 +0200 Subject: [PATCH 53/74] more on docs --- .../sessionization.rst | 62 +++++++++++++++++++ doc/multi_column_operations.rst | 1 + examples/0110_session_encoder.py | 29 ++++++--- 3 files changed, 83 insertions(+), 9 deletions(-) create mode 100644 doc/modules/multi_column_operations/sessionization.rst diff --git a/doc/modules/multi_column_operations/sessionization.rst b/doc/modules/multi_column_operations/sessionization.rst new file mode 100644 index 000000000..1fbf03125 --- /dev/null +++ b/doc/modules/multi_column_operations/sessionization.rst @@ -0,0 +1,62 @@ +.. _sessionization: + +.. |SessionEncoder| replace:: :class:`~skrub.SessionEncoder` +.. |BaseEstimator| replace:: :class:`~sklearn.base.BaseEstimator` +.. |TransformerMixin| replace:: :class:`~sklearn.base.TransformerMixin` + + +Detecting sessions in timestamped data with the SessionEncoder +---------------------------------------------------------------- + +When dealing with timestamped data (data that includes at least a timestamp column), +it may be beneficial to try and identify groups of events through **sessionization**. + +Sessionization is the process of grouping a sequence of events (like user +interactions) into meaningful sessions. A session typically starts fresh or +after a period of inactivity. + +For example, in an online retail context, you might define a new session whenever +more than 30 minutes pass with no activity from a user. On a website, a session may +define a sequence of requests made by a single end-user within a certain time duration. 
+ +While definitions may vary depending on the specific use case, being able to detect +such "bursts" of activity by a user can help with building features that often have +greater predictive power than raw individual events. + +The |SessionEncoder| helps addressing this problem by detecting sessions based on +a timestamp column, other "session columns" (e.g., user and device) that should be +used to distinguish between sessions, and a ``session_gap``. A session is then +defined as a sequence of events that share the same value in the "session columns" +and whose events are closer to each other than the ``session_gap``. + +>>> from skrub import SessionEncoder +>>> from skrub.datasets import make_retail_events +>>> events = make_retail_events(n_events=100, random_state=0) +>>> X, y = events.X, events.y + +Once the necessary features are provided, the |SessionEncoder| +returns a dataframe that includes a ``session_id`` column, which includes an integer, +monotonically increasing ID, for each session: + +>>> se = SessionEncoder(timestamp_col="timestamp", split_by="user_id", session_gap=30 * 60) +>>> res = se.fit_transform(X) +>>> res.head(5) + user_id timestamp device_type page_category event_type time_on_page price_viewed timestamp_session_id +0 user_0164 2024-01-01 03:29:07.708922+00:00 mobile fashion page_view 134.1 309.80 59 +1 user_0164 2024-01-01 03:29:42.185048+00:00 tablet books search 103.4 11.00 59 +2 user_0164 2024-01-01 03:32:38.352703+00:00 desktop home wishlist 180.3 4.80 59 +3 user_0008 2024-01-02 10:49:56.974375+00:00 mobile books page_view 7.0 33.94 2 +4 user_0149 2024-01-04 10:00:15.882835+00:00 desktop electronics page_view 108.5 4.44 49 + +Once the session ID is available, it becomes possible to compute aggregations on +each session, for example to find the duration of a session, or the number of sessions +by a user. + +.. warning:: + +Aggregation can introduce data leakage! 
Records should only be aggregated from +within the training set at training time and the test set at predict time. To +ensure this is the case, any code that performs aggregation can be wrapped in a +scikit-learn |BaseEstimator| (as shown in the +:ref:`SessionEncoder example `, +or the pipeline should use the skrub :ref:`Data Ops framework`. diff --git a/doc/multi_column_operations.rst b/doc/multi_column_operations.rst index fc5e5e7a0..403f70b1a 100644 --- a/doc/multi_column_operations.rst +++ b/doc/multi_column_operations.rst @@ -15,3 +15,4 @@ multiple columns. modules/multi_column_operations/selectors modules/multi_column_operations/type_of_selectors modules/multi_column_operations/advanced_selectors + modules/multi_column_operations/sessionization diff --git a/examples/0110_session_encoder.py b/examples/0110_session_encoder.py index ceaf942d4..cf5c8f16f 100644 --- a/examples/0110_session_encoder.py +++ b/examples/0110_session_encoder.py @@ -1,14 +1,16 @@ """ +Sessions in time-based data: Predicting user purchases with the SessionEncoder +=============================================================================== + .. |SessionEncoder| replace:: :class:`~skrub.SessionEncoder` .. |make_retail_events| replace:: :func:`~skrub.datasets.make_retail_events` .. |tabular_pipeline| replace:: :func:`~skrub.tabular_pipeline` .. |TableVectorizer| replace:: :class:`~skrub.TableVectorizer` .. |DummyClassifier| replace:: :class:`~sklearn.dummy.DummyClassifier` .. |TimeSeriesSplit| replace:: :class:`~sklearn.model_selection.TimeSeriesSplit` - -Sessions in time-based data: Predicting conversion with the |SessionEncoder| -========================================================================== +.. |BaseEstimator| replace:: :class:`~sklearn.base.BaseEstimator` +.. 
|TransformerMixin| replace:: :class:`~sklearn.base.TransformerMixin` This example shows how to use |SessionEncoder| in a scikit-learn pipeline to create session-level features (sessionization) for conversion prediction, that is @@ -44,15 +46,21 @@ splitter = TimeSeriesSplit(n_splits=5) # %% -# We begin by generating the data with |make_retail_events| and marking feature -# and target data with |skrub.X| and |skrub.y| so they can be used -# in a DataOps workflow. - +# We begin by generating the data with |make_retail_events| and defining out +# features and target. +from skrub import TableReport from skrub.datasets import make_retail_events events = make_retail_events(n_users=20, n_events=5000, random_state=0) X, y = events.X, events.y -X +TableReport(X) +# %% +# The data contains 5000 events from 20 users, where each event is timestamped. +# Other columns include the event type, device used by the user, page category, +# time spent on page and price of the item. The target variable indicates whether +# a user session eventually contains a purchase event: all events in that session +# will have a target value of 1 if a purchase happens, and 0 otherwise. + # %% # Sanity check: evaluate a DummyClassifier on raw event data # --------------------------------------------------------------- @@ -112,8 +120,11 @@ X_sessions.head() # %% +# Defining a custom transformer for session-level aggregation +# ----------------------------------------------------------- # To avoid data leakage and maintain a clean pipeline, we can create a custom -# transformer that computes session-level aggregates within a scikit-learn pipeline. +# transformer that inherits from |BaseEstimator| and |TransformerMixin| and +# computes session-level aggregates within a scikit-learn pipeline. # This transformer will be fitted and applied separately within each fold of # cross-validation, ensuring that session features are computed only on the training # data of each fold. 
From ecf5e0f979c8eb2700547905c30b28d6e5e56dec Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 2 Jun 2026 16:22:38 +0200 Subject: [PATCH 54/74] _ --- skrub/datasets/_generating.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/skrub/datasets/_generating.py b/skrub/datasets/_generating.py index 61211042c..9523920c7 100644 --- a/skrub/datasets/_generating.py +++ b/skrub/datasets/_generating.py @@ -238,7 +238,7 @@ def toy_cities(seed=0, size=1000, nulls=0.1, n_metrics=4): raise ValueError(f"n_metrics must be a positive integer, got {n_metrics}.") rng = np.random.default_rng(seed=seed) - now = datetime.fromisoformat("2024-01-01").timestamp() + now = datetime.datetime.fromisoformat("2024-01-01").timestamp() capitals = [ "Amsterdam", "Athens", @@ -283,9 +283,9 @@ def toy_cities(seed=0, size=1000, nulls=0.1, n_metrics=4): df_dates = pd.DataFrame(v.T, columns=["start", "end"]) if hasattr(df_dates, "map"): - df_dates = df_dates.map(datetime.fromtimestamp) + df_dates = df_dates.map(datetime.datetime.fromtimestamp) else: - df_dates = df_dates.applymap(datetime.fromtimestamp) + df_dates = df_dates.applymap(datetime.datetime.fromtimestamp) # As above, "end" sees some of its values set to null. 
p = rng.uniform(0, 1, size=size) df_dates["end"] = df_dates["end"].where(p >= nulls) From f694e15231a710fa58099bd3b09567e38d015862 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 2 Jun 2026 16:32:51 +0200 Subject: [PATCH 55/74] _ --- doc/modules/multi_column_operations/sessionization.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/multi_column_operations/sessionization.rst b/doc/modules/multi_column_operations/sessionization.rst index 1fbf03125..044409389 100644 --- a/doc/modules/multi_column_operations/sessionization.rst +++ b/doc/modules/multi_column_operations/sessionization.rst @@ -40,7 +40,7 @@ monotonically increasing ID, for each session: >>> se = SessionEncoder(timestamp_col="timestamp", split_by="user_id", session_gap=30 * 60) >>> res = se.fit_transform(X) ->>> res.head(5) +>>> res.head(5) # doctest: +SKIP user_id timestamp device_type page_category event_type time_on_page price_viewed timestamp_session_id 0 user_0164 2024-01-01 03:29:07.708922+00:00 mobile fashion page_view 134.1 309.80 59 1 user_0164 2024-01-01 03:29:42.185048+00:00 tablet books search 103.4 11.00 59 From 2de71c46750160d1cb809ef6a1277f3217ff43ee Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 8 Jun 2026 17:45:59 +0200 Subject: [PATCH 56/74] addressing some of the comments from the review --- .../sessionization.rst | 73 ++++++++++++++----- examples/0110_session_encoder.py | 3 + skrub/_session_encoder.py | 25 +++---- 3 files changed, 68 insertions(+), 33 deletions(-) diff --git a/doc/modules/multi_column_operations/sessionization.rst b/doc/modules/multi_column_operations/sessionization.rst index 044409389..231088c76 100644 --- a/doc/modules/multi_column_operations/sessionization.rst +++ b/doc/modules/multi_column_operations/sessionization.rst @@ -9,25 +9,31 @@ Detecting sessions in timestamped data with the SessionEncoder ---------------------------------------------------------------- When dealing with timestamped data (data that 
includes at least a timestamp column), -it may be beneficial to try and identify groups of events through **sessionization**. +it may be beneficial to try and identify groups of events as +:ref:`"sessions" `_, +through **sessionization**. Sessionization is the process of grouping a sequence of events (like user -interactions) into meaningful sessions. A session typically starts fresh or -after a period of inactivity. - -For example, in an online retail context, you might define a new session whenever -more than 30 minutes pass with no activity from a user. On a website, a session may +interactions) into meaningful sessions. +For example, in an online retail context you might define a new session whenever +more than 30 minutes pass with no activity from the user. On a website, a session may define a sequence of requests made by a single end-user within a certain time duration. While definitions may vary depending on the specific use case, being able to detect -such "bursts" of activity by a user can help with building features that often have -greater predictive power than raw individual events. +such "bursts" of activity by a user can often help with building features that have +greater predictive power than raw individual events, such as number of sessions or +average session duration. + +The |SessionEncoder| addresses this problem by detecting sessions based on +a timestamp column, other session-related columns (e.g., user and device) that should be +used to distinguish between sessions, and a ``session_gap``. Session-related columns +-- identified by the ``split_by`` parameter -- allow to split sessions based on +the provided parameters, for example to group user actions only if they were conducted +on the same device. -The |SessionEncoder| helps addressing this problem by detecting sessions based on -a timestamp column, other "session columns" (e.g., user and device) that should be -used to distinguish between sessions, and a ``session_gap``. 
A session is then -defined as a sequence of events that share the same value in the "session columns" -and whose events are closer to each other than the ``session_gap``. +A session is then defined as a sequence of events that share the same value in the +``split_by`` columns, and whose events are closer to each other than the +``session_gap``. >>> from skrub import SessionEncoder >>> from skrub.datasets import make_retail_events @@ -35,9 +41,8 @@ and whose events are closer to each other than the ``session_gap``. >>> X, y = events.X, events.y Once the necessary features are provided, the |SessionEncoder| -returns a dataframe that includes a ``session_id`` column, which includes an integer, -monotonically increasing ID, for each session: - +returns a dataframe that includes a ``timestamp_session_id`` column, which is +composed of a monotonically increasing integer ID for each session: >>> se = SessionEncoder(timestamp_col="timestamp", split_by="user_id", session_gap=30 * 60) >>> res = se.fit_transform(X) >>> res.head(5) # doctest: +SKIP @@ -48,7 +53,7 @@ monotonically increasing ID, for each session: 3 user_0008 2024-01-02 10:49:56.974375+00:00 mobile books page_view 7.0 33.94 2 4 user_0149 2024-01-04 10:00:15.882835+00:00 desktop electronics page_view 108.5 4.44 49 -Once the session ID is available, it becomes possible to compute aggregations on +With the session ID, it becomes possible to compute aggregations on each session, for example to find the duration of a session, or the number of sessions by a user. @@ -60,3 +65,37 @@ ensure this is the case, any code that performs aggregation can be wrapped in a scikit-learn |BaseEstimator| (as shown in the :ref:`SessionEncoder example `, or the pipeline should use the skrub :ref:`Data Ops framework`. + +The |SessionEncoder| includes the ``suffix`` parameter (by default +``suffix="session_id"``) to specify what the name of the new column should be. 
+This can help with creating multiple session IDs based on the same timestamp. +For example, we might want to create sessions based on users, and based on users +and their device: + +>>> se = SessionEncoder(timestamp_col="timestamp", +... split_by="user_id", +... session_gap=30 * 60, +... suffix="user" +... ) +>>> res = se.fit_transform(X) +>>> res.head(5) # doctest: +SKIP + user_id timestamp ... price_viewed timestamp_user +0 user_0164 2024-01-01 03:29:07.708922+00:00 ... 309.80 59 +1 user_0164 2024-01-01 03:29:42.185048+00:00 ... 11.00 59 +2 user_0164 2024-01-01 03:32:38.352703+00:00 ... 4.80 59 +3 user_0008 2024-01-02 10:49:56.974375+00:00 ... 33.94 2 +4 user_0149 2024-01-04 10:00:15.882835+00:00 ... 4.44 49 + +>>> se = SessionEncoder(timestamp_col="timestamp", +... split_by=["user_id", "device_type"], +... session_gap=30 * 60, +... suffix="user_device" +... ) +>>> res = se.fit_transform(X) +>>> res.head(5) # doctest: +SKIP + user_id timestamp ... price_viewed timestamp_user_device +0 user_0164 2024-01-01 03:29:07.708922+00:00 ... 309.80 75 +1 user_0164 2024-01-01 03:29:42.185048+00:00 ... 11.00 76 +2 user_0164 2024-01-01 03:32:38.352703+00:00 ... 4.80 74 +3 user_0008 2024-01-02 10:49:56.974375+00:00 ... 33.94 2 +4 user_0149 2024-01-04 10:00:15.882835+00:00 ... 4.44 59 diff --git a/examples/0110_session_encoder.py b/examples/0110_session_encoder.py index cf5c8f16f..cf8c4e074 100644 --- a/examples/0110_session_encoder.py +++ b/examples/0110_session_encoder.py @@ -42,6 +42,8 @@ # %% # Since this is temporal data, we use a time-aware CV strategy with # |TimeSeriesSplit| to avoid leakage. We reuse the same splitter for all evaluations. +# The dataset is sorted by timestamp, so the training set will always contain only +# past data relative to the test set. 
from sklearn.model_selection import TimeSeriesSplit splitter = TimeSeriesSplit(n_splits=5) @@ -111,6 +113,7 @@ # pipeline, we can achieve this by using |SessionEncoder| followed by a custom # transformer that computes session aggregates, and ensuring that the pipeline is # properly fitted within each fold of cross-validation. + # %% from skrub import SessionEncoder, tabular_pipeline diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 0711fa72f..2b3a0a017 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -1,13 +1,16 @@ """ The SessionEncoder is a transformer that takes as input: - a "timestamp" column, which identifies the time of an event -- a "by" column or list of columns, which identifies a user +- a "split_by" column or list of columns, which identifies a user - a "session_gap" value, which identifies the maximum allowed gap in seconds between events in a session -It returns a dataframe with the same number of rows as the input, but with the -column "session_id": a unique identifier for each session, which is a combination -of the "by" column(s) and a session number +It returns a dataframe with the same number of rows as the input, but with an +additional column that identifies the session to which each event belongs. +The name of the session column is "{timestamp}_{suffix}", where "timestamp" is the name +of the timestamp column, and "suffix" is a string that can be set via the "suffix" +parameter (default is "session_id"). 
The session column contains a unique identifier for +each session, which is a combination of the "split_by" column(s) and a session number """ import numbers @@ -38,12 +41,7 @@ def _add_session_column(X, split_by, timestamp_col, session_gap, session_column_ def _add_session_column_pandas( X, split_by, timestamp_col, session_gap, session_column_name ): - # astype(int64) is needed (rather than just int) because on windows this converts - # to int32 # check if the time difference between events exceeds the session gap - # dividing by 10**9 because int64 is in ms, while session_gap is in seconds - # as_unit("ns") is because the timestamp might be in a different unit (e.g. ms), - # and we want to make sure it's in ns for the diff to work correctly time_diff = X[timestamp_col].diff().dt.total_seconds().fillna(0) > session_gap if split_by: # check if the "split_by" column changes @@ -54,8 +52,7 @@ def _add_session_column_pandas( else: is_new_session = time_diff # Compute cumulative sum of is_new_session to create session IDs - X[session_column_name] = is_new_session.cumsum() - return X + return X.assign(**{session_column_name: is_new_session.cumsum()}) @_add_session_column.specialize("polars") @@ -63,10 +60,6 @@ def _add_session_column_polars( X, split_by, timestamp_col, session_gap, session_column_name ): # check if the time difference between events exceeds the session gap - # setting the time unit to "ns" (nanoseconds), and dividing by 10**9 because - # session_gap is in seconds - # using ns for consistency with pandas, which uses ns for timestamps, and - # to avoid issues with timestamps in different units time_diff = X[timestamp_col].diff().dt.total_seconds().fill_null(0) > session_gap if split_by: # check if the "split_by" column changes @@ -421,7 +414,7 @@ def fit_transform(self, X, y=None): for _ in self.all_inputs_ if _ not in self._split_by_columns + [self.timestamp_col] ]: - X_selected = sbd.drop_columns(X, s.cols(*cols_to_remove).expand(X)) + X_selected = 
sbd.drop_columns(X, cols_to_remove) else: X_selected = X From 1dc32c29243c25663b91f3962468ef32d6746ad1 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 8 Jun 2026 18:14:01 +0200 Subject: [PATCH 57/74] clean up test --- skrub/tests/test_session_encoder.py | 72 ++++++++++++++++------------- 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/skrub/tests/test_session_encoder.py b/skrub/tests/test_session_encoder.py index 5471cbfb1..16a78ee8d 100644 --- a/skrub/tests/test_session_encoder.py +++ b/skrub/tests/test_session_encoder.py @@ -109,7 +109,7 @@ def example_session_data_multi_by(df_module): ("username", 6, {"alice": 3, "bob": 2, "charlie": 1}), ], ) -def test_session_encoder_basic( +def test_basic_functionality( example_session_data, by_column, expected_sessions, split_key_to_sessions ): """Test basic sessionization grouping by user_id or username.""" @@ -152,7 +152,7 @@ def test_session_encoder_basic( ("username", ["alice", "bob", "charlie"]), ], ) -def test_session_encoder_different_users_different_sessions( +def test_different_users_different_sessions( example_session_data, by_column, group_keys ): """Test that different users/groups have different session IDs.""" @@ -181,7 +181,7 @@ def test_session_encoder_different_users_different_sessions( assert len(sessions1.intersection(sessions2)) == 0 -def test_session_encoder_multi_by_columns(example_session_data_multi_by): +def test_multi_by_columns(example_session_data_multi_by): """Test sessionization when a user is identified by a combination of columns. The fixture has user_id=1 on two devices ("mobile" and "desktop"). 
When @@ -231,7 +231,7 @@ def test_session_encoder_multi_by_columns(example_session_data_multi_by): assert group_sessions[k1].isdisjoint(group_sessions[k2]) -def test_session_encoder_multiple_users(df_module): +def test_multiple_users(df_module): """Test sessionization with multiple users interleaved.""" timestamps = [] user_ids = [] @@ -264,7 +264,7 @@ def test_session_encoder_multiple_users(df_module): assert len(set(session_ids)) == 2 -def test_session_encoder_time_gap_threshold(df_module): +def test_time_gap_threshold(df_module): """Test that session_gap parameter correctly determines sessionization.""" timestamps = [ datetime.datetime(2024, 1, 1, 10, 0), @@ -298,7 +298,7 @@ def test_session_encoder_time_gap_threshold(df_module): assert len(set(session_ids_40)) == 1 -def test_session_encoder_no_user_column(df_module): +def test_no_user_column(df_module): """Test sessionization without a user identifier column. When ``split_by`` is None, all events are treated as from the same "user", and @@ -333,7 +333,7 @@ def test_session_encoder_no_user_column(df_module): assert session_ids[0] != session_ids[3] # Sessions are different -def test_session_encoder_single_event(df_module): +def test_single_event(df_module): """Test sessionization with single event per user.""" df = df_module.make_dataframe( { @@ -351,22 +351,6 @@ def test_session_encoder_single_event(df_module): assert session_ids[0] == 0 -def test_session_encoder_empty_dataframe(df_module): - """Test sessionization with empty dataframe.""" - df = df_module.make_dataframe( - { - "timestamp": [], - "user_id": [], - } - ) - - se = SessionEncoder(split_by="user_id", timestamp_col="timestamp", session_gap=30) - result = se.fit_transform(df) - - assert sbd.shape(result)[0] == 0 - assert "timestamp_session_id" in sbd.column_names(result) - - @pytest.mark.parametrize( "group_by_param,timestamp_col_param,expected_error_type,expected_error_match", [ @@ -396,7 +380,7 @@ def test_session_encoder_empty_dataframe(df_module): 
), ], ) -def test_session_encoder_missing_column_error( +def test_missing_column_error( df_module, group_by_param, timestamp_col_param, @@ -420,7 +404,7 @@ def test_session_encoder_missing_column_error( se.fit_transform(df) -def test_session_encoder_invalid_parameters(df_module): +def test_invalid_parameters(df_module): """Test that invalid parameters raise appropriate errors.""" df = df_module.make_dataframe( { @@ -457,8 +441,21 @@ def test_session_encoder_invalid_parameters(df_module): with pytest.raises(ValueError, match="Expected a string as suffix"): se_invalid_suffix.fit_transform(df) + # Test timestamp column with non-datetime type + df_invalid_timestamp = df_module.make_dataframe( + { + "timestamp": ["2024-01-01 10:00:00"], # string instead of datetime + "user_id": [101], + } + ) + se_invalid_timestamp = SessionEncoder( + split_by="user_id", timestamp_col="timestamp", session_gap=30 + ) + with pytest.raises(TypeError, match="Expected a datetime column for timestamp_col"): + se_invalid_timestamp.fit_transform(df_invalid_timestamp) + -def test_session_encoder_preserves_columns(df_module): +def test_preserves_columns(df_module): """Test that original columns are preserved in output.""" df = df_module.make_dataframe( { @@ -481,7 +478,7 @@ def test_session_encoder_preserves_columns(df_module): assert "timestamp_session_id" in result_cols -def test_session_encoder_fit_and_transform(df_module): +def test_fit_and_transform(df_module): """Test that fit() and transform() work separately.""" df = df_module.make_dataframe( { @@ -664,7 +661,7 @@ def test_proper_suffix(timestamp, suffix, df_module): assert expected_name in sbd.column_names(result) -def test_session_encoder_preserves_input_order(df_module): +def test_preserves_input_order(df_module): """Test that the output rows are in the same order as the input rows. 
The encoder sorts internally to detect sessions correctly, but the result @@ -708,12 +705,21 @@ def test_error_dispatch(func): func(np.array([1])) -def test_empty_frame(df_module): - empty_df = df_module.make_dataframe({"timestamp": []}) - encoder = SessionEncoder("timestamp") - result = encoder.fit_transform(empty_df) +def test_empty_dataframe(df_module): + """Test sessionization with empty dataframe.""" + df = df_module.make_dataframe( + { + "timestamp": [], + "user_id": [], + } + ) + + se = SessionEncoder(split_by="user_id", timestamp_col="timestamp", session_gap=30) + result = se.fit_transform(df) - assert sbd.column_names(result) == ["timestamp", "timestamp_session_id"] + assert sbd.shape(result)[0] == 0 + assert "timestamp_session_id" in sbd.column_names(result) + assert sbd.column_names(result) == ["timestamp", "user_id", "timestamp_session_id"] def test_not_overwriting_columns(df_module): From d199ec241b5c6612f5cb36340a30110a602c57bf Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 8 Jun 2026 18:15:02 +0200 Subject: [PATCH 58/74] addressing more comments --- skrub/_session_encoder.py | 65 +++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 2b3a0a017..69096de72 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -86,6 +86,7 @@ def _factorize_column(X, column_name): def _factorize_column_pandas(X, column_name): if sbd.is_numeric(X[column_name]): return X[column_name] + # TODO: convert datetimes/durations to numeric codes, _ = pd.factorize(X[column_name]) return codes @@ -96,6 +97,7 @@ def _factorize_column_polars(X, column_name): if sbd.is_numeric(X[column_name]): return X[column_name] + # TODO: convert datetimes/durations to numeric return X[column_name].cast(pl.Categorical).to_physical() @@ -379,6 +381,7 @@ def fit_transform(self, X, y=None): # Checking that all the needed columns are there 
self._check_input_dataframe() + # check the correctness of the values of session_gap if not isinstance(self.session_gap, numbers.Number): raise TypeError(f"Expected a number, got {type(self.session_gap)}") @@ -389,18 +392,27 @@ def fit_transform(self, X, y=None): if not isinstance(self.suffix, str) or self.suffix is None: raise ValueError(f"Expected a string as suffix, got {self.suffix!r}") - self._session_id_name = f"{self.timestamp_col}_{self.suffix}" + # check that the timestamp column is of datetime type + if not sbd.is_empty_frame(X) and not sbd.is_any_date( + sbd.col(X, self.timestamp_col) + ): + raise TypeError( + "Expected a datetime column for timestamp_col," + f" got {self.timestamp_col!r}" + ) + + self.session_id_name_ = f"{self.timestamp_col}_{self.suffix}" # If the generated session id column name already exists in the input dataframe, # we add a random suffix to avoid overwriting it - if self._session_id_name in self.all_inputs_: - self._session_id_name += f"_skrub_{random_string()}" + if self.session_id_name_ in self.all_inputs_: + self.session_id_name_ += f"_skrub_{random_string()}" # if the input dataframe is empty, we can skip all the processing and # return an empty dataframe with the session_id column added if sbd.is_empty_frame(X): X = sbd.with_columns( - X, **{self._session_id_name: np.array([], dtype=np.float32)} + X, **{self.session_id_name_: np.array([], dtype=np.float32)} ) return X @@ -428,9 +440,12 @@ def fit_transform(self, X, y=None): X_factorized, factorized_by = self._factorize_columns(X_sorted) - X_with_session_id = self._add_session_id( + X_with_session_id = _add_session_column( X_factorized, factorized_by, + self.timestamp_col, + self.session_gap, + self.session_id_name_, ) # Reordering rows back to the original order X_result = sbd.sort(X_with_session_id, by=row_order_col) @@ -446,7 +461,7 @@ def fit_transform(self, X, y=None): X_result = sbd.concat(X_result, s.select(X, cols_to_remove), axis=1) # Reordering columns so that the 
session_id is added as the last column - X_result = s.select(X_result, self.all_inputs_ + [self._session_id_name]) + X_result = s.select(X_result, self.all_inputs_ + [self.session_id_name_]) self.all_outputs_ = sbd.column_names(X_result) return X_result @@ -471,26 +486,26 @@ def _check_input_dataframe(self): """ Check that the input columns are present and correct """ - + # TODO: move here all the error checking on the input dataframe + # and the parameters # Check that the timestamp column is present if self.timestamp_col not in self.all_inputs_: raise ValueError( f"Column '{self.timestamp_col}' not found in input dataframe" ) # check that the required columns are present in the input dataframe - self._split_by_columns = [] - if self.split_by is not None: - if isinstance(self.split_by, str): - self._split_by_columns = [self.split_by] - elif isinstance(self.split_by, Iterable) and not isinstance( - self.split_by, str - ): - self._split_by_columns = list(self.split_by) - else: - raise TypeError("split_by must be a string, a list of strings, or None") - for col in self._split_by_columns: - if col not in self.all_inputs_: - raise ValueError(f"Column '{col}' not found in input dataframe") + if self.split_by is None: + self._split_by_columns = [] + return + if isinstance(self.split_by, str): + self._split_by_columns = [self.split_by] + elif isinstance(self.split_by, Iterable): + self._split_by_columns = list(self.split_by) + else: + raise TypeError("split_by must be a string, a list of strings, or None") + for col in self._split_by_columns: + if col not in self.all_inputs_: + raise ValueError(f"Column '{col}' not found in input dataframe") def _factorize_columns(self, X): """ @@ -511,16 +526,6 @@ def _factorize_columns(self, X): return X_factorized, list(factorized_columns.keys()) - def _add_session_id(self, X_factorized, factorized_by): - X_with_session_id = _add_session_column( - X_factorized, - factorized_by, - self.timestamp_col, - self.session_gap, - 
self._session_id_name, - ) - return X_with_session_id - def get_feature_names_out(self, input_features=None): """Return the column names of the output of ``transform`` as a list of strings. From d22774cb4d31a7fde312ef2a542d82fc23f3e453 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 9 Jun 2026 15:08:47 +0200 Subject: [PATCH 59/74] changelog --- CHANGES.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 5e25899fb..51940d6bb 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -36,7 +36,8 @@ New Features Additionally, it is possible to provide a ``by`` column or list of columns (e.g., user ID or (user ID, user device)) to compute sessions for each grouping value. A new synthetic dataset generator has also been added. - :pr:`1930` by :user:`Riccardo Cappuzzo <rcap107>`.- The :class:`DropSimilar` transformer has been added, for removing columns in a + :pr:`1930` by :user:`Riccardo Cappuzzo <rcap107>`. +- The :class:`DropSimilar` transformer has been added, for removing columns in a dataframe that present high correlation with other columns. :pr:`2023` by :user:`Eloi Massoulié `. - :class:`ToFloat32` now allows users to specify ``decimal`` and ``thousand`` From f728604f2bf6a440124bb9bbf9b557f59dcabd68 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 9 Jun 2026 15:14:50 +0200 Subject: [PATCH 60/74] example --- skrub/_session_encoder.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 69096de72..b0bb02a72 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -144,18 +144,22 @@ class SessionEncoder(TransformerMixin, BaseEstimator): All column names in the input dataframe plus the new column that identifies the session, with name "{timestamp}_{suffix}". + session_id_name_ : str + The name of the session ID column that is added to the dataframe. 
This is + generated as "{timestamp_col}_{suffix}", but if this name already exists in + the input dataframe, a random suffix is added to avoid overwriting it. + Examples -------- Consider this example where we have a dataframe with user events, and we want to identify sessions based on a 30-minute gap between events for each user. Users are identified by the value of the column ``user_id``. + Note that the order of the events in the input dataframe does not matter: + the ``SessionEncoder`` will sort the events by user and timestamp before + identifying sessions (and sort them back to the original order at the end). >>> import pandas as pd - >>> from skrub import SessionEncoder >>> from datetime import datetime, timedelta - >>> encoder = SessionEncoder( - ... split_by='user_id', timestamp_col='timestamp' - ... ) >>> data = { ... 'user_id': ['alice', 'alice', 'alice', 'bob', 'bob'], ... 'timestamp': [ @@ -176,6 +180,12 @@ class SessionEncoder(TransformerMixin, BaseEstimator): 3 bob 2024-01-01 10:00:00 login 4 bob 2024-01-01 10:20:00 purchase + We use the ``SessionEncoder`` with default ``session_gap`` of 30 minutes: + + >>> from skrub import SessionEncoder + >>> encoder = SessionEncoder( + ... split_by='user_id', timestamp_col='timestamp' + ... 
) >>> result = encoder.fit_transform(df) >>> result user_id timestamp action timestamp_session_id From f215bdb3af739b106ff805e9bc8df368f0d40e61 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 9 Jun 2026 16:56:11 +0200 Subject: [PATCH 61/74] slight rewording --- skrub/_session_encoder.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index b0bb02a72..030d887a1 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -174,10 +174,10 @@ class SessionEncoder(TransformerMixin, BaseEstimator): >>> df = pd.DataFrame(data) >>> df user_id timestamp action - 0 alice 2024-01-01 10:00:00 login + 0 alice 2024-01-01 10:00:00 view 1 alice 2024-01-01 10:05:00 view 2 alice 2024-01-01 11:00:00 purchase - 3 bob 2024-01-01 10:00:00 login + 3 bob 2024-01-01 10:00:00 view 4 bob 2024-01-01 10:20:00 purchase We use the ``SessionEncoder`` with default ``session_gap`` of 30 minutes: @@ -189,10 +189,10 @@ class SessionEncoder(TransformerMixin, BaseEstimator): >>> result = encoder.fit_transform(df) >>> result user_id timestamp action timestamp_session_id - 0 alice 2024-01-01 10:00:00 login 0 + 0 alice 2024-01-01 10:00:00 view 0 1 alice 2024-01-01 10:05:00 view 0 2 alice 2024-01-01 11:00:00 purchase 1 - 3 bob 2024-01-01 10:00:00 login 2 + 3 bob 2024-01-01 10:00:00 view 2 4 bob 2024-01-01 10:20:00 purchase 2 In this example: From 5f04c14d090e598fca1a43a119e4bd95fa9376c6 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com> Date: Mon, 15 Jun 2026 14:31:12 +0200 Subject: [PATCH 62/74] Apply suggestions from code review Co-authored-by: Lisa --- doc/modules/multi_column_operations/sessionization.rst | 10 +++++----- examples/0110_session_encoder.py | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/doc/modules/multi_column_operations/sessionization.rst b/doc/modules/multi_column_operations/sessionization.rst index 
231088c76..060e74a81 100644 --- a/doc/modules/multi_column_operations/sessionization.rst +++ b/doc/modules/multi_column_operations/sessionization.rst @@ -54,17 +54,17 @@ composed of a monotonically increasing integer ID for each session: 4 user_0149 2024-01-04 10:00:15.882835+00:00 desktop electronics page_view 108.5 4.44 49 With the session ID, it becomes possible to compute aggregations on -each session, for example to find the duration of a session, or the number of sessions +each session, for example to find the duration or number of sessions by a user. .. warning:: -Aggregation can introduce data leakage! Records should only be aggregated from -within the training set at training time and the test set at predict time. To +Caution! Aggregation can introduce data leakage. Records should only be aggregated from +within the training set at training time, and the test set at predict time. To ensure this is the case, any code that performs aggregation can be wrapped in a scikit-learn |BaseEstimator| (as shown in the -:ref:`SessionEncoder example `, -or the pipeline should use the skrub :ref:`Data Ops framework`. +:ref:`SessionEncoder example `), +otherwise the pipeline should use the skrub :ref:`Data Ops framework`. The |SessionEncoder| includes the ``suffix`` parameter (by default ``suffix="session_id"``) to specify what the name of the new column should be. diff --git a/examples/0110_session_encoder.py b/examples/0110_session_encoder.py index cf8c4e074..2b98761e9 100644 --- a/examples/0110_session_encoder.py +++ b/examples/0110_session_encoder.py @@ -48,7 +48,7 @@ splitter = TimeSeriesSplit(n_splits=5) # %% -# We begin by generating the data with |make_retail_events| and defining out +# We begin by generating the data with |make_retail_events| and defining our # features and target. 
from skrub import TableReport from skrub.datasets import make_retail_events @@ -111,7 +111,7 @@ # Note that session-based features involve aggregations, which must be performed # only on the training data within each fold to avoid leakage. In a scikit-learn # pipeline, we can achieve this by using |SessionEncoder| followed by a custom -# transformer that computes session aggregates, and ensuring that the pipeline is +# transformer that computes session aggregates, and ensures that the pipeline is # properly fitted within each fold of cross-validation. # %% @@ -163,12 +163,12 @@ def transform(self, X): print("ROC-AUC with session encoding:", scores.mean()) # %% -# As expected, the model with session encoding performs much better than the baseline +# As expected the model with session encoding performs much better than the baseline # without session features, demonstrating the value of sessionization for conversion # prediction. # # The fact that we are working with aggregation means that it was necessary to -# create a custom transformer to compute session-level features. This situation -# can be avoided by using the skrub DataOps workflow, which allows for more +# create a custom transformer to compute session-level features. However, this situation +# can be avoided entirely by using the skrub DataOps workflow, which allows for more # flexible data transformations without needing to fit everything within a # scikit-learn pipeline. From efada377b777b745eb1442a6024f96d5586e01ce Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 15 Jun 2026 14:39:17 +0200 Subject: [PATCH 63/74] fixing doctest --- skrub/_session_encoder.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 030d887a1..29cb129ac 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -173,12 +173,12 @@ class SessionEncoder(TransformerMixin, BaseEstimator): ... 
} >>> df = pd.DataFrame(data) >>> df - user_id timestamp action - 0 alice 2024-01-01 10:00:00 view - 1 alice 2024-01-01 10:05:00 view - 2 alice 2024-01-01 11:00:00 purchase - 3 bob 2024-01-01 10:00:00 view - 4 bob 2024-01-01 10:20:00 purchase + user_id timestamp action + 0 alice 2024-01-01 10:00:00 login + 1 alice 2024-01-01 10:05:00 view + 2 alice 2024-01-01 11:00:00 purchase + 3 bob 2024-01-01 10:00:00 login + 4 bob 2024-01-01 10:20:00 purchase We use the ``SessionEncoder`` with default ``session_gap`` of 30 minutes: @@ -188,12 +188,12 @@ class SessionEncoder(TransformerMixin, BaseEstimator): ... ) >>> result = encoder.fit_transform(df) >>> result - user_id timestamp action timestamp_session_id - 0 alice 2024-01-01 10:00:00 view 0 - 1 alice 2024-01-01 10:05:00 view 0 - 2 alice 2024-01-01 11:00:00 purchase 1 - 3 bob 2024-01-01 10:00:00 view 2 - 4 bob 2024-01-01 10:20:00 purchase 2 + user_id timestamp action timestamp_session_id + 0 alice 2024-01-01 10:00:00 login 0 + 1 alice 2024-01-01 10:05:00 view 0 + 2 alice 2024-01-01 11:00:00 purchase 1 + 3 bob 2024-01-01 10:00:00 login 2 + 4 bob 2024-01-01 10:20:00 purchase 2 In this example: From 73adcfba6bb1abb04277ee1eba5324c58d6b3a17 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 15 Jun 2026 15:01:35 +0200 Subject: [PATCH 64/74] reworking docstring, renaming attr --- skrub/_session_encoder.py | 129 ++++++++++++++++++++------------------ 1 file changed, 68 insertions(+), 61 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 29cb129ac..ba4cb248a 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -144,7 +144,7 @@ class SessionEncoder(TransformerMixin, BaseEstimator): All column names in the input dataframe plus the new column that identifies the session, with name "{timestamp}_{suffix}". - session_id_name_ : str + session_id_column_ : str The name of the session ID column that is added to the dataframe. 
This is generated as "{timestamp_col}_{suffix}", but if this name already exists in the input dataframe, a random suffix is added to avoid overwriting it. @@ -158,27 +158,45 @@ class SessionEncoder(TransformerMixin, BaseEstimator): the ``SessionEncoder`` will sort the events by user and timestamp before identifying sessions (and sort them back to the original order at the end). + Sessions are defined by sorting over the ``split_by`` columns (if provided) + and then by the timestamp. + >>> import pandas as pd >>> from datetime import datetime, timedelta >>> data = { - ... 'user_id': ['alice', 'alice', 'alice', 'bob', 'bob'], - ... 'timestamp': [ - ... pd.Timestamp('2024-01-01 10:00:00'), - ... pd.Timestamp('2024-01-01 10:05:00'), # 5 min later, same session - ... pd.Timestamp('2024-01-01 11:00:00'), # 55 min later, new session - ... pd.Timestamp('2024-01-01 10:00:00'), # Different user - ... pd.Timestamp('2024-01-01 10:20:00'), # 20 min later, same session + ... "user_id": [1, 1, 1, 1, 1, 2, 2], + ... "device_id": [ + ... "mobile", + ... "mobile", + ... "desktop", + ... "desktop", + ... "mobile", + ... "mobile", + ... "mobile", ... ], - ... 'action': ['login', 'view', 'purchase', 'login', 'purchase'] + ... "timestamp": [ + ... pd.Timestamp("2024-01-01 10:00:00"), + ... pd.Timestamp("2024-01-01 10:10:00"), # 10 min later, same session + ... pd.Timestamp("2024-01-01 10:05:00"), # Different device (sorted), + ... # different session + ... pd.Timestamp("2024-01-01 10:20:00"), # 15 min later, same session + ... # as the previous desktop event + ... pd.Timestamp("2024-01-01 11:20:00"), # 60 min later, new session + ... pd.Timestamp("2024-01-01 10:00:00"), # Different user + ... pd.Timestamp("2024-01-01 10:15:00"), # 15 min later, same session + ... ], + ... "action": ["view", "purchase", "view", "checkout", "view", "login", "view"], ... 
} >>> df = pd.DataFrame(data) - >>> df - user_id timestamp action - 0 alice 2024-01-01 10:00:00 login - 1 alice 2024-01-01 10:05:00 view - 2 alice 2024-01-01 11:00:00 purchase - 3 bob 2024-01-01 10:00:00 login - 4 bob 2024-01-01 10:20:00 purchase + >>> print(df) + user_id device_id timestamp action + 0 1 mobile 2024-01-01 10:00:00 view + 1 1 mobile 2024-01-01 10:10:00 purchase + 2 1 desktop 2024-01-01 10:05:00 view + 3 1 desktop 2024-01-01 10:20:00 checkout + 4 1 mobile 2024-01-01 11:20:00 view + 5 2 mobile 2024-01-01 10:00:00 login + 6 2 mobile 2024-01-01 10:15:00 view We use the ``SessionEncoder`` with default ``session_gap`` of 30 minutes: @@ -188,20 +206,21 @@ class SessionEncoder(TransformerMixin, BaseEstimator): ... ) >>> result = encoder.fit_transform(df) >>> result - user_id timestamp action timestamp_session_id - 0 alice 2024-01-01 10:00:00 login 0 - 1 alice 2024-01-01 10:05:00 view 0 - 2 alice 2024-01-01 11:00:00 purchase 1 - 3 bob 2024-01-01 10:00:00 login 2 - 4 bob 2024-01-01 10:20:00 purchase 2 - - In this example: - - - Alice's first two events (10:00 and 10:05) are 5 minutes apart, so they form - session 0. - - Alice's third event (11:00) is 55 minutes after the previous one, exceeding - the 30-minute gap, so it forms a new session (session 1). - - Bob's events form session 2 (different user), with both events within the + user_id device_id timestamp action timestamp_session_id + 0 1 mobile 2024-01-01 10:00:00 view 0 + 1 1 mobile 2024-01-01 10:10:00 purchase 0 + 2 1 desktop 2024-01-01 10:05:00 view 0 + 3 1 desktop 2024-01-01 10:20:00 checkout 0 + 4 1 mobile 2024-01-01 11:20:00 view 1 + 5 2 mobile 2024-01-01 10:00:00 login 2 + 6 2 mobile 2024-01-01 10:15:00 view 2 + + In this example, grouping by `user_id` results in three separate sessions: + - User 1 has two sessions (session 0 and session 1) because there is a gap of + 60 minutes between their events at 10:20 and 11:20, which exceeds the 30-minute + threshold. 
The first four events of user 1 belong to session 0, while the + last event belongs to session 1. + - User 2 has one session (session 2) because their events are within the 30-minute window. You can also identify users by multiple columns. For instance, the same user @@ -211,41 +230,29 @@ class SessionEncoder(TransformerMixin, BaseEstimator): ... split_by=['user_id', 'device_id'], ... timestamp_col='timestamp', ... ) - >>> data_multi = { - ... 'user_id': [1, 1, 1, 1, 2, 2], - ... 'device_id': ['mobile', 'mobile', 'desktop', 'desktop', 'mobile', 'mobile'], - ... 'timestamp': [ - ... pd.Timestamp('2024-01-01 10:00:00'), - ... pd.Timestamp('2024-01-01 10:10:00'), # 10 min later, same session - ... pd.Timestamp('2024-01-01 10:05:00'), # Different device (sorted), - ... # different session - ... pd.Timestamp('2024-01-01 10:20:00'), # 15 min later, same session - ... pd.Timestamp('2024-01-01 10:00:00'), # Different user - ... pd.Timestamp('2024-01-01 10:15:00'), # 15 min later, same session - ... ], - ... 'action': ['view', 'purchase', 'view', 'checkout', 'login', 'view'] - ... } - >>> df_multi = pd.DataFrame(data_multi) - >>> result_multi = encoder_multi.fit_transform(df_multi) - >>> result_multi + >>> result_multi = encoder_multi.fit_transform(df) + >>> print(result_multi) user_id device_id timestamp action timestamp_session_id 0 1 mobile 2024-01-01 10:00:00 view 1 1 1 mobile 2024-01-01 10:10:00 purchase 1 2 1 desktop 2024-01-01 10:05:00 view 0 3 1 desktop 2024-01-01 10:20:00 checkout 0 - 4 2 mobile 2024-01-01 10:00:00 login 2 - 5 2 mobile 2024-01-01 10:15:00 view 2 + 4 1 mobile 2024-01-01 11:20:00 view 2 + 5 2 mobile 2024-01-01 10:00:00 login 3 + 6 2 mobile 2024-01-01 10:15:00 view 3 In this example: - User 1 on "desktop" has session 0. - - User 1 on "mobile" has session 1 (different device, so separate session). - - User 2 on "mobile" has session 2 (different user). 
+ - User 1 on "mobile" has two sessions, session 1 and session 2, because there + is a gap of 60 minutes between their events at 10:10 and 11:20, which exceeds + the 30-minute threshold. + - User 2 on "mobile" has session 3 (different user). - Note that sessions are defined by sorting over the ``split_by`` columns and then - by the timestamp: this is why, while the "desktop" - session of User 1 starts after their "mobile" session, it has session id ``0`` - since in alphabetical ordering "desktop" is first. + Note again that sessions are defined by sorting over the ``split_by`` columns + and then by the timestamp: this is why the "desktop" session of User 1 is + session 0, even though it starts after the "mobile" session in the original + dataframe. You can also use ``SessionEncoder`` without a user identifier column. In this case, sessions are separated only by time gaps. This is useful for analyzing a single @@ -411,18 +418,18 @@ def fit_transform(self, X, y=None): f" got {self.timestamp_col!r}" ) - self.session_id_name_ = f"{self.timestamp_col}_{self.suffix}" + self.session_id_column_ = f"{self.timestamp_col}_{self.suffix}" # If the generated session id column name already exists in the input dataframe, # we add a random suffix to avoid overwriting it - if self.session_id_name_ in self.all_inputs_: - self.session_id_name_ += f"_skrub_{random_string()}" + if self.session_id_column_ in self.all_inputs_: + self.session_id_column_ += f"_skrub_{random_string()}" # if the input dataframe is empty, we can skip all the processing and # return an empty dataframe with the session_id column added if sbd.is_empty_frame(X): X = sbd.with_columns( - X, **{self.session_id_name_: np.array([], dtype=np.float32)} + X, **{self.session_id_column_: np.array([], dtype=np.float32)} ) return X @@ -455,7 +462,7 @@ def fit_transform(self, X, y=None): factorized_by, self.timestamp_col, self.session_gap, - self.session_id_name_, + self.session_id_column_, ) # Reordering rows back to the 
original order X_result = sbd.sort(X_with_session_id, by=row_order_col) @@ -471,7 +478,7 @@ def fit_transform(self, X, y=None): X_result = sbd.concat(X_result, s.select(X, cols_to_remove), axis=1) # Reordering columns so that the session_id is added as the last column - X_result = s.select(X_result, self.all_inputs_ + [self.session_id_name_]) + X_result = s.select(X_result, self.all_inputs_ + [self.session_id_column_]) self.all_outputs_ = sbd.column_names(X_result) return X_result From 74f5991ab13f37e95b6ec75d3699d431b55d3cd9 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 15 Jun 2026 15:14:15 +0200 Subject: [PATCH 65/74] moving error checking, more work on docstring --- skrub/_session_encoder.py | 63 ++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index ba4cb248a..727329ce8 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -102,26 +102,36 @@ def _factorize_column_polars(X, column_name): class SessionEncoder(TransformerMixin, BaseEstimator): - """Encode sessions from a dataframe. + """Add a session ID column to a dataframe based on time gaps and other columns. A session is defined as a sequence of events where consecutive events are separated by at most ``session_gap`` seconds. Additionally, it is possible to provide a column or list of columns that can be used to distinguish between sessions, such as user identifiers (specified by the ``split_by`` column). When the time gap between consecutive events exceeds ``session_gap``, or - when what identifies a user changes, a new session begins. All unrelated columns - are passed through unchanged. + when what identifies a user changes, a new session begins. 
+ The encoder takes care of sorting the data by the timestamp and ``split_by`` columns + before identifying sessions, and sorting it back to the original order at the end, + so the original order of events in the input dataframe does not matter. + All unrelated columns are passed through unchanged. Parameters ---------- timestamp_col : str The name of the column that identifies the time of an event. This column - is used to determine the start and end of a session. + is used to determine the start and end of a session. ``timestamp_col`` must + be a datetime, and will be rejected otherwise. + The dataframe is sorted by ``timestamp_col`` and ``split_by`` (if provided) + before identifying sessions, and sorted back to the original order at + the end, so the order of events in the input dataframe does not matter. split_by : optional[str, list[str]], default=None The name of the column, or list of columns, to use to define sessions. A session boundary is created when the value in any of these columns changes, or when the time gap between events exceeds ``session_gap``. + The dataframe is sorted by ``split_by`` and ``timestamp_col`` before + identifying sessions, and sorted back to the original order at the end, + so the order of events in the input dataframe does not matter. This is typically a user identifier column, but it can also be used to define sessions by other groupings (e.g. user and device type). 
If not provided, sessions are detected based on the time gap between events, @@ -397,26 +407,7 @@ def fit_transform(self, X, y=None): self.all_inputs_ = sbd.column_names(X) # Checking that all the needed columns are there - self._check_input_dataframe() - - # check the correctness of the values of session_gap - if not isinstance(self.session_gap, numbers.Number): - raise TypeError(f"Expected a number, got {type(self.session_gap)}") - if self.session_gap <= 0: - raise ValueError( - f"session_gap must be a positive number, got {self.session_gap}" - ) - if not isinstance(self.suffix, str) or self.suffix is None: - raise ValueError(f"Expected a string as suffix, got {self.suffix!r}") - - # check that the timestamp column is of datetime type - if not sbd.is_empty_frame(X) and not sbd.is_any_date( - sbd.col(X, self.timestamp_col) - ): - raise TypeError( - "Expected a datetime column for timestamp_col," - f" got {self.timestamp_col!r}" - ) + self._check_input_dataframe(X) self.session_id_column_ = f"{self.timestamp_col}_{self.suffix}" @@ -499,17 +490,35 @@ def transform(self, X): check_is_fitted(self) return self.fit_transform(X) - def _check_input_dataframe(self): + def _check_input_dataframe(self, X): """ Check that the input columns are present and correct """ - # TODO: move here all the error checking on the input dataframe - # and the parameters + # check the correctness of the values of session_gap + if not isinstance(self.session_gap, numbers.Number): + raise TypeError(f"Expected a number, got {type(self.session_gap)}") + if self.session_gap <= 0: + raise ValueError( + f"session_gap must be a positive number, got {self.session_gap}" + ) + # check that the suffix is a string + if not isinstance(self.suffix, str) or self.suffix is None: + raise ValueError(f"Expected a string as suffix, got {self.suffix!r}") + # Check that the timestamp column is present if self.timestamp_col not in self.all_inputs_: raise ValueError( f"Column '{self.timestamp_col}' not found in input 
dataframe" ) + # check that the timestamp column is of datetime type + if not sbd.is_empty_frame(X) and not sbd.is_any_date( + sbd.col(X, self.timestamp_col) + ): + raise TypeError( + "Expected a datetime column for timestamp_col," + f" got {self.timestamp_col!r}" + ) + # check that the required columns are present in the input dataframe if self.split_by is None: self._split_by_columns = [] From 49a458361c80d143853fb85f67b1c44a72bc8bd6 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 15 Jun 2026 15:50:49 +0200 Subject: [PATCH 66/74] simplifying part of the code --- skrub/_session_encoder.py | 42 ++++++++++++--------------------------- 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 727329ce8..a5d7c0f04 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -420,30 +420,21 @@ def fit_transform(self, X, y=None): # return an empty dataframe with the session_id column added if sbd.is_empty_frame(X): X = sbd.with_columns( - X, **{self.session_id_column_: np.array([], dtype=np.float32)} + X, **{self.session_id_column_: np.array([], dtype=np.int64)} ) return X # Adding a row order column to sort lines back row_order_col = f"_row_order_skrub_{random_string()}" - X = sbd.with_columns(X, **{row_order_col: range(X.shape[0])}) - - # Dropping unneeded columns to reduce the sorting overhead - if cols_to_remove := [ - _ - for _ in self.all_inputs_ - if _ not in self._split_by_columns + [self.timestamp_col] - ]: - X_selected = sbd.drop_columns(X, cols_to_remove) - else: - X_selected = X + X_with_order = sbd.with_columns(X, **{row_order_col: range(X.shape[0])}) # sort the input dataframe by the "split_by" and "timestamp" columns - sort_by = ( - self._split_by_columns + [self.timestamp_col] - if self.split_by is not None - else [self.timestamp_col] - ) + # _split_by_columns can be empty if self.split_by is None + sort_by = self._split_by_columns + [self.timestamp_col] + + # 
Selecting only the columns needed for sessionization and sorting them + # to ensure that the sessionization is done correctly + X_selected = s.select(X_with_order, sort_by + [row_order_col]) X_sorted = sbd.sort(X_selected, by=sort_by) X_factorized, factorized_by = self._factorize_columns(X_sorted) @@ -458,18 +449,11 @@ def fit_transform(self, X, y=None): # Reordering rows back to the original order X_result = sbd.sort(X_with_session_id, by=row_order_col) - # drop the factorized "split_by" columns if the original "split_by" - # columns were not numeric, and the column used to reorder - to_drop = [col for col in factorized_by if col not in self._split_by_columns] - to_drop += [row_order_col] - X_result = sbd.drop_columns(X_result, to_drop) - - # If unrelated columns were removed earlier, bring them back here - if cols_to_remove: - X_result = sbd.concat(X_result, s.select(X, cols_to_remove), axis=1) - - # Reordering columns so that the session_id is added as the last column - X_result = s.select(X_result, self.all_inputs_ + [self.session_id_column_]) + # Concatenating the session_id column to the original dataframe, so that + # all unrelated columns are passed through unchanged. + # Doing this has the added benefit of adding the session_id column at the + # end of the dataframe. 
+ X_result = sbd.concat(X, s.select(X_result, self.session_id_column_), axis=1) self.all_outputs_ = sbd.column_names(X_result) return X_result From 77942d250238cf8681efed79f521b72fc7e727b7 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 15 Jun 2026 16:15:20 +0200 Subject: [PATCH 67/74] addressing comments --- skrub/_session_encoder.py | 52 ++++++++++++--------- skrub/tests/test_session_encoder.py | 71 +++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 21 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index a5d7c0f04..82239722a 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -86,7 +86,10 @@ def _factorize_column(X, column_name): def _factorize_column_pandas(X, column_name): if sbd.is_numeric(X[column_name]): return X[column_name] - # TODO: convert datetimes/durations to numeric + if sbd.is_any_date(X[column_name]): + return X[column_name].astype(np.int64) + if sbd.is_duration(X[column_name]): + return X[column_name].dt.total_seconds().astype(np.int64) codes, _ = pd.factorize(X[column_name]) return codes @@ -97,7 +100,10 @@ def _factorize_column_polars(X, column_name): if sbd.is_numeric(X[column_name]): return X[column_name] - # TODO: convert datetimes/durations to numeric + if sbd.is_any_date(X[column_name]): + return X[column_name].cast(pl.Int64) + if sbd.is_duration(X[column_name]): + return X[column_name].dt.total_seconds().cast(pl.Int64) return X[column_name].cast(pl.Categorical).to_physical() @@ -416,6 +422,28 @@ def fit_transform(self, X, y=None): if self.session_id_column_ in self.all_inputs_: self.session_id_column_ += f"_skrub_{random_string()}" + X_result = self.transform(X, y) + + return X_result + + def transform(self, X, y=None): + """Transform the data by encoding sessions. + + Parameters + ---------- + X : pandas.DataFrame or polars.DataFrame + The input dataframe. + + y : None + Ignored. 
+ + Returns + ------- + pandas.DataFrame or polars.DataFrame + The transformed dataframe with session information. + """ + check_is_fitted(self) + # if the input dataframe is empty, we can skip all the processing and # return an empty dataframe with the session_id column added if sbd.is_empty_frame(X): @@ -458,22 +486,6 @@ def fit_transform(self, X, y=None): self.all_outputs_ = sbd.column_names(X_result) return X_result - def transform(self, X): - """Transform the data by encoding sessions. - - Parameters - ---------- - X : pandas.DataFrame or polars.DataFrame - The input dataframe. - - Returns - ------- - pandas.DataFrame or polars.DataFrame - The transformed dataframe with session information. - """ - check_is_fitted(self) - return self.fit_transform(X) - def _check_input_dataframe(self, X): """ Check that the input columns are present and correct @@ -523,12 +535,10 @@ def _factorize_columns(self, X): ensure that the diff operation works correctly """ - if not self.split_by: + if self.split_by is None: return X, [] factorized_columns = { f"{col}_factorized_skrub_{random_string()}": _factorize_column(X, col) - if not sbd.is_numeric(X[col]) - else X[col] for col in self._split_by_columns } diff --git a/skrub/tests/test_session_encoder.py b/skrub/tests/test_session_encoder.py index 16a78ee8d..5d8adebfe 100644 --- a/skrub/tests/test_session_encoder.py +++ b/skrub/tests/test_session_encoder.py @@ -546,6 +546,49 @@ def test_factorize_column_numeric(df_module): df_module.assert_column_equal(codes, df["user_id"]) +def test_factorize_column_date(df_module): + """_factorize_column on a datetime column should return int64 codes.""" + df = df_module.make_dataframe( + { + "ts": [ + datetime.datetime(2024, 1, 1, 10, 0), + datetime.datetime(2024, 1, 1, 10, 5), + datetime.datetime(2024, 1, 1, 10, 0), # same as first + datetime.datetime(2024, 1, 1, 11, 0), + ] + } + ) + codes = _factorize_column(df, "ts") + # First and third row have the same timestamp -> same code + assert 
codes[0] == codes[2] + # Different timestamps -> different codes + assert codes[0] != codes[1] + assert codes[1] != codes[3] + assert codes[0] != codes[3] + + +def test_factorize_column_duration(df_module): + """_factorize_column on a duration column should return int64 codes + representing total seconds.""" + df = df_module.make_dataframe( + { + "dur": [ + datetime.timedelta(minutes=30), + datetime.timedelta(hours=1), + datetime.timedelta(minutes=30), # same as first + datetime.timedelta(minutes=90), + ] + } + ) + codes = _factorize_column(df, "dur") + # First and third row have the same duration -> same code + assert codes[0] == codes[2] + # Different durations -> different codes + assert codes[0] != codes[1] + assert codes[1] != codes[3] + assert codes[0] != codes[3] + + def test_check_is_new_session_no_by(df_module): """_check_is_new_session with an empty group_by-list uses only the time gap.""" df = df_module.make_dataframe( @@ -765,3 +808,31 @@ def test_not_overwriting_columns(df_module): # The original "timestamp_custom_name" column should not be overwritten # The new column has name "timestamp_custom_name_skrub_RANDOM_SUFFIX" assert col_names[2].removeprefix("timestamp_custom_name").startswith("_skrub_") + + +def test_empty_column_name(df_module): + """Test that an empty string as column name is a valid split by column name.""" + df = df_module.make_dataframe( + { + "timestamp": [ + datetime.datetime(2024, 1, 1, 10, 0), + datetime.datetime(2024, 1, 1, 10, 10), # 10 min — within gap + datetime.datetime(2024, 1, 1, 11, 0), # 50 min — exceeds gap + datetime.datetime(2024, 1, 1, 11, 5), # 5 min — within gap + ], + "": [1, 1, 1, 2], + } + ) + encoder = SessionEncoder(timestamp_col="timestamp", split_by="") + result = encoder.fit_transform(df) + assert sbd.shape(result)[0] == 4 + assert "timestamp_session_id" in sbd.column_names(result) + assert sbd.to_list(sbd.col(result, "timestamp_session_id")) == [0, 0, 1, 2] + + # Check that not passing the split_by 
parameter (default None) also works + # and returns the proper result + encoder = SessionEncoder(timestamp_col="timestamp") + result = encoder.fit_transform(df) + assert sbd.shape(result)[0] == 4 + assert "timestamp_session_id" in sbd.column_names(result) + assert sbd.to_list(sbd.col(result, "timestamp_session_id")) == [0, 0, 1, 1] From 05c7bf700c16984be4eeedd4490599ec85153fe4 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 15 Jun 2026 17:58:32 +0200 Subject: [PATCH 68/74] removing factorizer --- skrub/_session_encoder.py | 152 ++++++++++++---------------- skrub/tests/test_session_encoder.py | 98 ++---------------- 2 files changed, 74 insertions(+), 176 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 82239722a..82f6f7412 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -26,85 +26,80 @@ from ._dispatch import dispatch, raise_dispatch_unregistered_type from ._utils import random_string -try: - import polars as pl -except ImportError: - pass - @dispatch -def _add_session_column(X, split_by, timestamp_col, session_gap, session_column_name): +def _add_session_column( + X, split_by_columns, timestamp_col, session_gap, session_id_column_ +): raise_dispatch_unregistered_type(X, kind="Dataframe") @_add_session_column.specialize("pandas") def _add_session_column_pandas( - X, split_by, timestamp_col, session_gap, session_column_name + X, split_by_columns, timestamp_col, session_gap, session_id_column_ ): - # check if the time difference between events exceeds the session gap - time_diff = X[timestamp_col].diff().dt.total_seconds().fillna(0) > session_gap - if split_by: - # check if the "split_by" column changes - has_split_change = (X[split_by].diff().fillna(0) != 0).any(axis=1) - # a new session starts if either the "split_by" column changes or the time - # gap is exceeded - is_new_session = has_split_change | time_diff - else: - is_new_session = time_diff - # Compute cumulative sum of is_new_session 
to create session IDs - return X.assign(**{session_column_name: is_new_session.cumsum()}) + groups = X.groupby(split_by_columns) if len(split_by_columns) > 0 else [("", X)] + rolling_session_id = 0 + + groups_with_session_ids = [] + + for group_key, group_df in groups: + # Sort the group by timestamp + group_df_sorted = group_df.sort_values(by=timestamp_col) + # Compute time differences between consecutive events + time_diffs = group_df_sorted[timestamp_col].diff().dt.total_seconds() + # Identify session boundaries based on time gaps + session_boundaries = (time_diffs > session_gap) | (time_diffs.isna()) + # Assign session IDs based on cumulative sum of session boundaries + session_ids = session_boundaries.cumsum() - 1 + rolling_session_id + # Update rolling_session_id for the next group + rolling_session_id = session_ids.max() + 1 + # Add the session IDs to the original dataframe + group_df_sorted = group_df_sorted.assign( + **{ + session_id_column_: pd.Series( + session_ids.values, index=group_df_sorted.index + ) + } + ) + groups_with_session_ids.append((group_key, group_df_sorted)) + res = sbd.concat(*[group_df for _, group_df in groups_with_session_ids], axis=0) + return res @_add_session_column.specialize("polars") def _add_session_column_polars( - X, split_by, timestamp_col, session_gap, session_column_name + X, split_by_columns, timestamp_col, session_gap, session_id_column_ ): - # check if the time difference between events exceeds the session gap - time_diff = X[timestamp_col].diff().dt.total_seconds().fill_null(0) > session_gap - if split_by: - # check if the "split_by" column changes - has_split_change = X.select( - pl.any_horizontal(pl.col(split_by).diff().fill_null(0) != 0) - ).to_series() - # a new session starts if either the "split_by" column changes or the time - # gap is exceeded - is_new_session = has_split_change | time_diff - else: - is_new_session = time_diff - # Add session_id by computing cumulative sum of is_new_session - return 
X.with_columns(is_new_session.cum_sum().alias(session_column_name)) - - -@dispatch -def _factorize_column(X, column_name): - # Factorization is done so different groups can be found by doing a simple - # numeric difference - raise_dispatch_unregistered_type(X, kind="Dataframe") - - -@_factorize_column.specialize("pandas") -def _factorize_column_pandas(X, column_name): - if sbd.is_numeric(X[column_name]): - return X[column_name] - if sbd.is_any_date(X[column_name]): - return X[column_name].astype(np.int64) - if sbd.is_duration(X[column_name]): - return X[column_name].dt.total_seconds().astype(np.int64) - codes, _ = pd.factorize(X[column_name]) - return codes - - -@_factorize_column.specialize("polars") -def _factorize_column_polars(X, column_name): - import polars as pl - - if sbd.is_numeric(X[column_name]): - return X[column_name] - if sbd.is_any_date(X[column_name]): - return X[column_name].cast(pl.Int64) - if sbd.is_duration(X[column_name]): - return X[column_name].dt.total_seconds().cast(pl.Int64) - return X[column_name].cast(pl.Categorical).to_physical() + groups = ( + X.group_by(split_by_columns, maintain_order=True) + if len(split_by_columns) > 0 + else [("", X)] + ) + rolling_session_id = 0 + + groups_with_session_ids = [] + + for group_key, group_df in groups: + # Sort the group by timestamp + group_df_sorted = group_df.sort(by=timestamp_col) + # Compute time differences between consecutive events + time_diffs = group_df_sorted[timestamp_col].diff().dt.total_seconds() + # Identify session boundaries based on time gaps + session_boundaries = (time_diffs > session_gap) | ( + time_diffs.is_nan() + ).fill_null(True) + # Assign session IDs based on cumulative sum of session boundaries + session_ids = session_boundaries.cum_sum() - 1 + rolling_session_id + # Update rolling_session_id for the next group + rolling_session_id = session_ids.max() + 1 + # Add the session IDs to the original dataframe + group_df_sorted = group_df_sorted.with_columns( + 
[session_ids.alias(session_id_column_)] + ) + groups_with_session_ids.append(group_df_sorted) + res = sbd.concat(*groups_with_session_ids, axis=0) + return res class SessionEncoder(TransformerMixin, BaseEstimator): @@ -463,13 +458,9 @@ def transform(self, X, y=None): # Selecting only the columns needed for sessionization and sorting them # to ensure that the sessionization is done correctly X_selected = s.select(X_with_order, sort_by + [row_order_col]) - X_sorted = sbd.sort(X_selected, by=sort_by) - - X_factorized, factorized_by = self._factorize_columns(X_sorted) - X_with_session_id = _add_session_column( - X_factorized, - factorized_by, + X_selected, + self._split_by_columns, self.timestamp_col, self.session_gap, self.session_id_column_, @@ -529,23 +520,6 @@ def _check_input_dataframe(self, X): if col not in self.all_inputs_: raise ValueError(f"Column '{col}' not found in input dataframe") - def _factorize_columns(self, X): - """ - convert split_by columns to numerical columns if they're not already, to - ensure that the diff operation works correctly - """ - - if self.split_by is None: - return X, [] - factorized_columns = { - f"{col}_factorized_skrub_{random_string()}": _factorize_column(X, col) - for col in self._split_by_columns - } - - X_factorized = sbd.with_columns(X, **factorized_columns) - - return X_factorized, list(factorized_columns.keys()) - def get_feature_names_out(self, input_features=None): """Return the column names of the output of ``transform`` as a list of strings. 
diff --git a/skrub/tests/test_session_encoder.py b/skrub/tests/test_session_encoder.py index 5d8adebfe..f92a27eb0 100644 --- a/skrub/tests/test_session_encoder.py +++ b/skrub/tests/test_session_encoder.py @@ -1,5 +1,4 @@ import datetime -from functools import partial import numpy as np import pandas as pd @@ -10,7 +9,6 @@ from skrub import _dataframe as sbd from skrub._session_encoder import ( _add_session_column, - _factorize_column, ) @@ -520,75 +518,6 @@ def test_get_feature_names(df_module): assert set(feature_names) == {"timestamp", "user_id", "timestamp_session_id"} -# --------------------------------------------------------------------------- -# Tests for the internal dispatched helper functions -# --------------------------------------------------------------------------- - - -def test_factorize_column_string(df_module): - """_factorize_column should map string values to consecutive integer codes.""" - df = df_module.make_dataframe({"user": ["alice", "bob", "alice", "charlie"]}) - codes = _factorize_column(df, "user") - - # alice appears first, so it should get code 0 - assert codes[0] == codes[2] # both "alice" → same code - assert codes[1] != codes[0] # "bob" differs from "alice" - assert codes[3] != codes[0] # "charlie" differs from "alice" - assert codes[1] != codes[3] # "bob" differs from "charlie" - assert all(int(c) == expected for c, expected in zip(codes, [0, 1, 0, 2])) - - -def test_factorize_column_numeric(df_module): - """_factorize_column on a numeric column should return the column unchanged.""" - df = df_module.make_dataframe({"user_id": [10, 20, 10, 30]}) - codes = _factorize_column(df, "user_id") - - df_module.assert_column_equal(codes, df["user_id"]) - - -def test_factorize_column_date(df_module): - """_factorize_column on a datetime column should return int64 codes.""" - df = df_module.make_dataframe( - { - "ts": [ - datetime.datetime(2024, 1, 1, 10, 0), - datetime.datetime(2024, 1, 1, 10, 5), - datetime.datetime(2024, 1, 1, 10, 0), # 
same as first - datetime.datetime(2024, 1, 1, 11, 0), - ] - } - ) - codes = _factorize_column(df, "ts") - # First and third row have the same timestamp -> same code - assert codes[0] == codes[2] - # Different timestamps -> different codes - assert codes[0] != codes[1] - assert codes[1] != codes[3] - assert codes[0] != codes[3] - - -def test_factorize_column_duration(df_module): - """_factorize_column on a duration column should return int64 codes - representing total seconds.""" - df = df_module.make_dataframe( - { - "dur": [ - datetime.timedelta(minutes=30), - datetime.timedelta(hours=1), - datetime.timedelta(minutes=30), # same as first - datetime.timedelta(minutes=90), - ] - } - ) - codes = _factorize_column(df, "dur") - # First and third row have the same duration -> same code - assert codes[0] == codes[2] - # Different durations -> different codes - assert codes[0] != codes[1] - assert codes[1] != codes[3] - assert codes[0] != codes[3] - - def test_check_is_new_session_no_by(df_module): """_check_is_new_session with an empty group_by-list uses only the time gap.""" df = df_module.make_dataframe( @@ -604,7 +533,7 @@ def test_check_is_new_session_no_by(df_module): session_id = sbd.to_list( sbd.col( _add_session_column( - df, [], "timestamp", 30 * 60, session_column_name="timestamp_session_id" + df, [], "timestamp", 30 * 60, session_id_column_="timestamp_session_id" ), "timestamp_session_id", ) @@ -628,7 +557,9 @@ def test_add_session_column_old_pandas(df_module): ) session_id = sbd.to_list( sbd.col( - _add_session_column(df, [], "timestamp", 30 * 60, "timestamp_session_id"), + _add_session_column( + df, [], "timestamp", 30 * 60, session_id_column_="timestamp_session_id" + ), "timestamp_session_id", ) ) @@ -730,22 +661,15 @@ def test_preserves_input_order(df_module): assert sbd.to_list(sbd.col(result, "timestamp")) == timestamps -@pytest.mark.parametrize( - "func", - ( - partial( - _add_session_column, - split_by=[], +def test_error_dispatch(): + with 
pytest.raises(TypeError, match="Expecting a Pandas or Polars Dataframe"): + _add_session_column( + np.array([1]), + split_by_columns=[], timestamp_col="timestamp", session_gap=30, - session_column_name="timestamp_session_id", - ), - partial(_factorize_column, column_name="user_id"), - ), -) -def test_error_dispatch(func): - with pytest.raises(TypeError, match="Expecting a Pandas or Polars Dataframe"): - func(np.array([1])) + session_id_column_="timestamp_session_id", + ) def test_empty_dataframe(df_module): From b3895757561f7ccc953ecebbbae306e97b783600 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 15 Jun 2026 18:03:17 +0200 Subject: [PATCH 69/74] docstring --- skrub/_session_encoder.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 82f6f7412..dba1708c2 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -109,8 +109,9 @@ class SessionEncoder(TransformerMixin, BaseEstimator): by at most ``session_gap`` seconds. Additionally, it is possible to provide a column or list of columns that can be used to distinguish between sessions, such as user identifiers (specified by the ``split_by`` column). - When the time gap between consecutive events exceeds ``session_gap``, or - when what identifies a user changes, a new session begins. + Sessions change when either the time gap between events exceeds ``session_gap``, + or the identifiers in ``split_by`` column(s) change. + The encoder takes care of sorting the data by the timestamp and ``split_by`` columns before identifying sessions, and sorting it back to the original order at the end, so the original order of events in the input dataframe does not matter. 
From 967595052fe88afad3b607d3b6ce8700df206f58 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 15 Jun 2026 18:08:31 +0200 Subject: [PATCH 70/74] doc fixes --- CHANGES.rst | 9 +++++---- skrub/_session_encoder.py | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index e526478df..951af8c56 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -31,11 +31,12 @@ New Features :meth:`DataOp.skb.eval`, :meth:`SkrubLearner.predict`, etc., or in :meth:`DataOp.skb.find` or :meth:`SkrubLearner.truncated_after`. :pr:`2062` by :user:`Jérôme Dockès `. -- The :class:`SessionEncoder` is now available. This encoder takes a dataframe with - a timestamp column and computes sessions based on the given session duration. - Additionally, it is possible to provide a ``by`` column or list of columns +- The :class:`SessionEncoder` is now available. This encoder adds a `session_id` + column, which groups together events that occur within the given session gap. + Additionally, it is possible to provide a ``split_by`` column or list of columns (e.g., user ID or (user ID, user device)) to compute sessions for each grouping - value. A new synthetic dataset generator has also been added. + value. A new synthetic dataset generator (:meth:`~skrub.datasets.make_retail_events`) + has also been added. :pr:`1930` by :user:`Riccardo Cappuzzo `. - The :class:`DropSimilar` transformer has been added, for removing columns in a dataframe that present high correlation with other columns. 
:pr:`2023` by diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index dba1708c2..3a5a28517 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -228,6 +228,7 @@ class SessionEncoder(TransformerMixin, BaseEstimator): 6 2 mobile 2024-01-01 10:15:00 view 2 In this example, grouping by `user_id` results in three separate sessions: + - User 1 has two sessions (session 0 and session 1) because there is a gap of 60 minutes between their events at 10:20 and 11:20, which exceeds the 30-minute threshold. The first four events of user 1 belong to session 0, while the From 0787468447fdcce774b1f9b8252548c3eb139d6d Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Mon, 15 Jun 2026 18:16:30 +0200 Subject: [PATCH 71/74] pandas grrr --- skrub/_session_encoder.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 3a5a28517..386b213ec 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -38,7 +38,9 @@ def _add_session_column( def _add_session_column_pandas( X, split_by_columns, timestamp_col, session_gap, session_id_column_ ): - groups = X.groupby(split_by_columns) if len(split_by_columns) > 0 else [("", X)] + # needed to avoid a warning with min deps + grouper = split_by_columns[0] if len(split_by_columns) == 1 else split_by_columns + groups = X.groupby(grouper) if len(split_by_columns) > 0 else [("", X)] rolling_session_id = 0 groups_with_session_ids = [] From 29514866dbea11d0cb9b554b3764b2e9d6b4b044 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 16 Jun 2026 09:08:15 +0200 Subject: [PATCH 72/74] fixing test on min deps --- skrub/_session_encoder.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 386b213ec..3d16a20ba 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -89,7 +89,8 @@ def _add_session_column_polars( time_diffs = 
group_df_sorted[timestamp_col].diff().dt.total_seconds() # Identify session boundaries based on time gaps session_boundaries = (time_diffs > session_gap) | ( - time_diffs.is_nan() + # need both is_nan and is_null to handle older versions of polars + time_diffs.is_nan() | time_diffs.is_null() ).fill_null(True) # Assign session IDs based on cumulative sum of session boundaries session_ids = session_boundaries.cum_sum() - 1 + rolling_session_id From 326dbb9660763fedde4914129250549bcb2cd3db Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 16 Jun 2026 09:09:09 +0200 Subject: [PATCH 73/74] adding a comment --- skrub/_session_encoder.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/skrub/_session_encoder.py b/skrub/_session_encoder.py index 3d16a20ba..90c09e562 100644 --- a/skrub/_session_encoder.py +++ b/skrub/_session_encoder.py @@ -53,6 +53,7 @@ def _add_session_column_pandas( # Identify session boundaries based on time gaps session_boundaries = (time_diffs > session_gap) | (time_diffs.isna()) # Assign session IDs based on cumulative sum of session boundaries + # cumsum - 1 to start session IDs at 0 session_ids = session_boundaries.cumsum() - 1 + rolling_session_id # Update rolling_session_id for the next group rolling_session_id = session_ids.max() + 1 @@ -93,6 +94,7 @@ def _add_session_column_polars( time_diffs.is_nan() | time_diffs.is_null() ).fill_null(True) # Assign session IDs based on cumulative sum of session boundaries + # cumsum - 1 to start session IDs at 0 session_ids = session_boundaries.cum_sum() - 1 + rolling_session_id # Update rolling_session_id for the next group rolling_session_id = session_ids.max() + 1 From e04fd29d0b469b475c16e2f1f9c9d05865c9dbb4 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 16 Jun 2026 10:43:42 +0200 Subject: [PATCH 74/74] changelog --- CHANGES.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 951af8c56..89606ffbc 100644 --- a/CHANGES.rst +++ 
b/CHANGES.rst @@ -35,8 +35,10 @@ New Features column, which groups together events that occur within the given session gap. Additionally, it is possible to provide a ``split_by`` column or list of columns (e.g., user ID or (user ID, user device)) to compute sessions for each grouping - value. A new synthetic dataset generator (:meth:`~skrub.datasets.make_retail_events`) - has also been added. + value. + :pr:`1930` by :user:`Riccardo Cappuzzo `. +- A new synthetic dataset generator for timestamped data and session-based + operations has been added: :meth:`~skrub.datasets.make_retail_events`. :pr:`1930` by :user:`Riccardo Cappuzzo `. - The :class:`DropSimilar` transformer has been added, for removing columns in a dataframe that present high correlation with other columns. :pr:`2023` by