From 9374425765c47e8d39721bb0f46c9a7efca26b52 Mon Sep 17 00:00:00 2001 From: Janne Santana Date: Wed, 10 Jun 2026 14:33:08 +0200 Subject: [PATCH 1/7] Add variance threshold to DropUninformative drop_if_constant --- skrub/_drop_uninformative.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/skrub/_drop_uninformative.py b/skrub/_drop_uninformative.py index e20e07e93..f8df2560b 100644 --- a/skrub/_drop_uninformative.py +++ b/skrub/_drop_uninformative.py @@ -85,10 +85,12 @@ def __init__( drop_if_constant=False, drop_if_unique=False, drop_null_fraction=1.0, + threshold=0.0 ): self.drop_if_constant = drop_if_constant self.drop_if_unique = drop_if_unique self.drop_null_fraction = drop_null_fraction + self.threshold = threshold def _check_params(self): if not isinstance(self.drop_if_constant, bool): @@ -126,9 +128,16 @@ def _drop_if_too_many_nulls(self, column): return self._null_count / len(column) > self.drop_null_fraction def _drop_if_constant(self, column): - if self.drop_if_constant: - if (sbd.n_unique(column) == 1) and (self._null_count == 0): - return True + if self.drop_if_constant: + if sbd.is_numeric(column) == 1 and (self._null_count == 0): # if numeric or boolean + if sbd.std(column)**2 <= self.threshold: # check if passes the threshold + return True + else: + return False + elif ((sbd.n_unique(column) == 1) and (self._null_count == 0)): + # use the original logic to deal with the other cases + return True + return False def _drop_if_unique(self, column): From b2c3948204040ba79b13a8c6a54d0509156d33e2 Mon Sep 17 00:00:00 2001 From: Janne Santana Date: Wed, 10 Jun 2026 15:19:48 +0200 Subject: [PATCH 2/7] Add low-variance test case for drop_if_constant threshold --- skrub/tests/test_drop_uninformative.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/skrub/tests/test_drop_uninformative.py b/skrub/tests/test_drop_uninformative.py index e3f04cacd..b0be9c5d5 100644 --- a/skrub/tests/test_drop_uninformative.py +++ b/skrub/tests/test_drop_uninformative.py @@ -130,6 +130,9 @@ def drop_if_constant_table(df_module):
"const", None, ], + "low_variance": [ + 0.01,0.02,0.05 + ] } ) @@ -141,6 +144,7 @@ def drop_if_constant_table(df_module): (dict(drop_if_constant=True), "constant_float", []), (dict(drop_if_constant=True), "constant_float_with_nulls", [2.5, 2.5, np.nan]), (dict(drop_if_constant=True), "constant_str", []), + (dict(drop_if_constant=True,threshold=0.5), "low_variance", []), ( dict(drop_if_constant=True), "constant_str_with_nulls", From 47232f0f2ac6dfc0b335013261f1bedf90205c41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eloi=20Massouli=C3=A9?= Date: Wed, 10 Jun 2026 15:23:09 +0200 Subject: [PATCH 3/7] Formatting --- skrub/_drop_uninformative.py | 20 ++++++++++++-------- skrub/tests/test_drop_uninformative.py | 6 ++---- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/skrub/_drop_uninformative.py b/skrub/_drop_uninformative.py index f8df2560b..21119831a 100644 --- a/skrub/_drop_uninformative.py +++ b/skrub/_drop_uninformative.py @@ -85,7 +85,7 @@ def __init__( drop_if_constant=False, drop_if_unique=False, drop_null_fraction=1.0, - threshold=0.0 + threshold=0.0, ): self.drop_if_constant = drop_if_constant self.drop_if_unique = drop_if_unique @@ -128,16 +128,20 @@ def _drop_if_too_many_nulls(self, column): return self._null_count / len(column) > self.drop_null_fraction def _drop_if_constant(self, column): - if self.drop_if_constant: - if sbd.is_numeric(column) == 1 and (self._null_count == 0): # if numeric or boolean - if sbd.std(column)**2 <= self.threshold: # check if passes the threshold - return True + if self.drop_if_constant: + if sbd.is_numeric(column) == 1 and ( + self._null_count == 0 + ): # if numeric or boolean + if ( + sbd.std(column) ** 2 <= self.threshold + ): # check if passes the threshold + return True else: return False - elif ((sbd.n_unique(column) == 1) and (self._null_count == 0)): + elif (sbd.n_unique(column) == 1) and (self._null_count == 0): # use the original logic to deal with the other cases - return True - + return True + return 
False def _drop_if_unique(self, column): diff --git a/skrub/tests/test_drop_uninformative.py b/skrub/tests/test_drop_uninformative.py index b0be9c5d5..e2f0b062b 100644 --- a/skrub/tests/test_drop_uninformative.py +++ b/skrub/tests/test_drop_uninformative.py @@ -130,9 +130,7 @@ def drop_if_constant_table(df_module): "const", None, ], - "low_variance": [ - 0.01,0.02,0.05 - ] + "low_variance": [0.01, 0.02, 0.05], } ) @@ -144,7 +142,7 @@ def drop_if_constant_table(df_module): (dict(drop_if_constant=True), "constant_float", []), (dict(drop_if_constant=True), "constant_float_with_nulls", [2.5, 2.5, np.nan]), (dict(drop_if_constant=True), "constant_str", []), - (dict(drop_if_constant=True,threshold=0.5), "low_variance", []), + (dict(drop_if_constant=True, threshold=0.5), "low_variance", []), ( dict(drop_if_constant=True), "constant_str_with_nulls", From 25f803b7066f6a1a929d72a7bdfcb78584544c12 Mon Sep 17 00:00:00 2001 From: Janne Santana Date: Wed, 10 Jun 2026 15:29:18 +0200 Subject: [PATCH 4/7] updating CHANGES.rst --- CHANGES.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 40c6d1230..296717cb6 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -54,6 +54,8 @@ Changes :pr:`2096` by :user:`Ayesha Siddiqua `. - The :class:`TableReport` can now be exported in markdown format with ``.markdown``. :pr:`2048` by :user:`Riccardo Cappuzzo `. +- The :class:`DropUninformative` was improved so that `drop_if_constant` becomes a variance threshold and it acts similarly to the VarianceThreshold transformer. + :pr:`2109` by :user:`Janne de Melo Santana `. 
 Bugfixes -------- From d0f8fc8a05c148225580558c12e8c3d5d7c78d84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eloi=20Massouli=C3=A9?= Date: Wed, 10 Jun 2026 15:31:18 +0200 Subject: [PATCH 5/7] Formatting --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 296717cb6..368d398c3 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -54,7 +54,7 @@ Changes :pr:`2096` by :user:`Ayesha Siddiqua `. - The :class:`TableReport` can now be exported in markdown format with ``.markdown``. :pr:`2048` by :user:`Riccardo Cappuzzo `. -- The :class:`DropUninformative` was improved so that `drop_if_constant` becomes a variance threshold and it acts similarly to the VarianceThreshold transformer. +- The :class:`DropUninformative` was improved so that `drop_if_constant` becomes a variance threshold and it acts similarly to the VarianceThreshold transformer. :pr:`2109` by :user:`Janne de Melo Santana `. Bugfixes From 850e104faeb421f14f1ba9db8843195597ea2387 Mon Sep 17 00:00:00 2001 From: Janne Santana Date: Wed, 10 Jun 2026 16:11:06 +0200 Subject: [PATCH 6/7] updating changelog --- CHANGES.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 368d398c3..d4bf5e942 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -54,8 +54,9 @@ Changes :pr:`2096` by :user:`Ayesha Siddiqua `. - The :class:`TableReport` can now be exported in markdown format with ``.markdown``. :pr:`2048` by :user:`Riccardo Cappuzzo `. -- The :class:`DropUninformative` was improved so that `drop_if_constant` becomes a variance threshold and it acts similarly to the VarianceThreshold transformer. - :pr:`2109` by :user:`Janne de Melo Santana `. +- :class:`DropUninformative` has a new ``threshold`` parameter: when ``drop_if_constant`` is + enabled, numeric columns without nulls whose variance is at most ``threshold`` are dropped, similarly to the ``VarianceThreshold`` transformer.
+ :pr:`2155` by :user:`Janne de Melo Santana `, :user:`Xixi Khamsane`, :user:`Rim El Khader`. Bugfixes -------- From b05e9e06b20dd43301bc874a952a688f1af7549c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eloi=20Massouli=C3=A9?= Date: Wed, 10 Jun 2026 16:26:59 +0200 Subject: [PATCH 7/7] Formatting --- CHANGES.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index d4bf5e942..def453bff 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -54,8 +54,8 @@ Changes :pr:`2096` by :user:`Ayesha Siddiqua `. - The :class:`TableReport` can now be exported in markdown format with ``.markdown``. :pr:`2048` by :user:`Riccardo Cappuzzo `. -- :class:`DropUninformative` has a new ``threshold`` parameter: when ``drop_if_constant`` is - enabled, numeric columns without nulls whose variance is at most ``threshold`` are dropped, similarly to the ``VarianceThreshold`` transformer. +- :class:`DropUninformative` has a new ``threshold`` parameter: when ``drop_if_constant`` is + enabled, numeric columns without nulls whose variance is at most ``threshold`` are dropped, similarly to the ``VarianceThreshold`` transformer. :pr:`2155` by :user:`Janne de Melo Santana `, :user:`Xixi Khamsane`, :user:`Rim El Khader`. Bugfixes