Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ Changes
:pr:`2096` by :user:`Ayesha Siddiqua <siddiqua-tamk>`.
- The :class:`TableReport` can now be exported in markdown format with ``.markdown``.
:pr:`2048` by :user:`Riccardo Cappuzzo <rcap107>`.
- The :class:`DropUninformative` was improved so that `drop_if_constant` becomes a variance

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This entry should be updated slightly. Something like:

DropUninformative now has the variance_threshold parameter, which allows to drop numeric and boolean columns whose variance is lower than the given threshold. The default behavior is unchanged.

threshold and it acts similarly to the VarianceThreshold transformer.
:pr:`2155` by :user:`Janne de Melo Santana <jannesantana>`, :user:`Xixi Khamsane<XIXI1234-creator>`, :user:`Rim El Khader<Rim-e>`

Bugfixes
--------
Expand Down
15 changes: 14 additions & 1 deletion skrub/_drop_uninformative.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,12 @@ def __init__(
drop_if_constant=False,
drop_if_unique=False,
drop_null_fraction=1.0,
threshold=0.0,

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
threshold=0.0,
variance_threshold=0.0,

it's better to be explicit here

the parameter should be renamed everywhere

):
self.drop_if_constant = drop_if_constant
self.drop_if_unique = drop_if_unique
self.drop_null_fraction = drop_null_fraction
self.threshold = threshold

def _check_params(self):
if not isinstance(self.drop_if_constant, bool):
Expand Down Expand Up @@ -127,8 +129,19 @@ def _drop_if_too_many_nulls(self, column):

def _drop_if_constant(self, column):
if self.drop_if_constant:
if (sbd.n_unique(column) == 1) and (self._null_count == 0):
if sbd.is_numeric(column) == 1 and (

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is_numeric returns false if the column has dtype bool

Suggested change
if sbd.is_numeric(column) == 1 and (
if (sbd.is_numeric(column) or sbd.is_bool(column)) and (

also there is no need to check that it's == 1

self._null_count == 0
): # if numeric or boolean
if (
sbd.std(column) ** 2 <= self.threshold
): # check if passes the threshold
return True
else:
return False
elif (sbd.n_unique(column) == 1) and (self._null_count == 0):

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the condition can be simplified a bit by moving the _null_count clause outside

if self.drop_if_constant and self._null_count == 0:
 ... 
if sbd.is_numeric(column) or sbd.is_bool(column):
...
elif sbd.n_unique(column) == 1
...

# use the original logic to deal with the other cases
return True

return False

def _drop_if_unique(self, column):
Expand Down
2 changes: 2 additions & 0 deletions skrub/tests/test_drop_uninformative.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ def drop_if_constant_table(df_module):
"const",
None,
],
"low_variance": [0.01, 0.02, 0.05],
}
)

Expand All @@ -141,6 +142,7 @@ def drop_if_constant_table(df_module):
(dict(drop_if_constant=True), "constant_float", []),
(dict(drop_if_constant=True), "constant_float_with_nulls", [2.5, 2.5, np.nan]),
(dict(drop_if_constant=True), "constant_str", []),
(dict(drop_if_constant=True, threshold=0.5), "low_variance", []),
(
dict(drop_if_constant=True),
"constant_str_with_nulls",
Expand Down
Loading