Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion examples/demo_column_transformer.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -268,8 +268,9 @@
"outputs": [],
"source": [
"import numpy as np\n",
"from lale.helpers import safe_issubdtype\n",
"num_cols = [col for col in train_X.columns\n",
" if np.issubdtype(train_X.dtypes[col], np.number)]\n",
" if safe_issubdtype(train_X.dtypes[col], np.number)]\n",
"cat_cols = [col for col in train_X.columns if col not in num_cols]"
]
},
Expand Down
6 changes: 4 additions & 2 deletions examples/demo_fairness_datasets.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@
"metadata": {},
"outputs": [],
"source": [
"from lale.helpers import safe_issubdtype\n",
"\n",
"def format_protected_attribute(pattrs, index):\n",
" return \" \" if len(pattrs) <= index else pattrs[index][\"feature\"]\n",
"\n",
Expand All @@ -128,7 +130,7 @@
" \"origin\": dataset_origins[dataset_name],\n",
" \"n_rows\": len(X),\n",
" \"n_cols\": X.shape[1],\n",
" \"any_categorical\": any(not np.issubdtype(t, np.number) for t in X.dtypes),\n",
" \"any_categorical\": any(not safe_issubdtype(t, np.number) for t in X.dtypes),\n",
" \"any_missing\": X.isna().any().any(),\n",
" \"n_labels\": len(y.unique()),\n",
" \"target_name\": y.name,\n",
Expand Down Expand Up @@ -496,7 +498,7 @@
"\n",
"def make_prep(X):\n",
" any_missing = X.isna().any().any()\n",
" cols_num = [c for c, t in zip(X.columns, X.dtypes) if np.issubdtype(t, np.number)]\n",
" cols_num = [c for c, t in zip(X.columns, X.dtypes) if safe_issubdtype(t, np.number)]\n",
" cols_cat = [c for c in X.columns if c not in cols_num]\n",
" if len(cols_num) > 0:\n",
" prep_num = lale.lib.rasl.Project(columns=cols_num)\n",
Expand Down
32 changes: 25 additions & 7 deletions lale/datasets/data_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from typing import Any, List, Literal, Optional, Tuple, Type, Union

import numpy as np
from numpy import issubdtype, ndarray
from numpy import ndarray
from pandas import DataFrame, Series
from pandas.core.groupby import DataFrameGroupBy, SeriesGroupBy
from scipy.sparse import csr_matrix
Expand Down Expand Up @@ -365,24 +365,42 @@ def strip_schema(obj):


def _dtype_to_schema(typ) -> JSON_TYPE:
from lale.helpers import safe_issubdtype

result: JSON_TYPE
if typ is bool or issubdtype(typ, np.bool_):
# Handle pandas extension dtypes (e.g., StringDtype in pandas 3.x)
# These are not np.dtype instances and have a 'name' attribute
if hasattr(typ, "name") and not isinstance(typ, np.dtype):
# Pandas extension dtype - check the name to determine the type
dtype_name = str(typ.name).lower()
if "string" in dtype_name or "str" in dtype_name:
result = {"type": "string"}
elif "int" in dtype_name:
result = {"type": "integer"}
elif "float" in dtype_name or "double" in dtype_name:
result = {"type": "number"}
elif "bool" in dtype_name:
result = {"type": "boolean"}
else:
# Default to string for unknown extension dtypes
result = {"type": "string"}
elif typ is bool or safe_issubdtype(typ, np.bool_):
result = {"type": "boolean"}
elif issubdtype(typ, np.unsignedinteger):
elif safe_issubdtype(typ, np.unsignedinteger):
result = {"type": "integer", "minimum": 0}
elif issubdtype(typ, np.integer):
elif safe_issubdtype(typ, np.integer):
result = {"type": "integer"}
elif issubdtype(typ, np.number):
elif safe_issubdtype(typ, np.number):
result = {"type": "number"}
elif issubdtype(typ, np.str_) or issubdtype(typ, np.bytes_):
elif safe_issubdtype(typ, np.str_) or safe_issubdtype(typ, np.bytes_):
result = {"type": "string"}
elif isinstance(typ, np.dtype):
if typ.fields:
props = {k: _dtype_to_schema(t) for k, t in typ.fields.items()}
result = {"type": "object", "properties": props}
elif typ.shape:
result = _shape_and_dtype_to_schema(typ.shape, typ.subdtype)
elif issubdtype(typ, np.object_):
elif safe_issubdtype(typ, np.object_):
result = {"type": "string"}
else:
assert False, f"unexpected dtype {typ}"
Expand Down
20 changes: 19 additions & 1 deletion lale/datasets/openml/openml_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -684,6 +684,14 @@ def fetch(
y: Optional[Any] = None
if preprocess:
arffData = pd.DataFrame(dataDictionary["data"])
# Convert string columns to object dtype for backward compatibility with pandas 2.x
# In pandas 3.x, string columns use StringDtype by default which causes issues with SimpleImputer
for col in arffData.columns:
if (
hasattr(arffData[col].dtype, "name")
and arffData[col].dtype.name == "string"
):
arffData[col] = arffData[col].astype("object")
# arffData = arffData.fillna(0)
attributes = dataDictionary["attributes"]

Expand Down Expand Up @@ -729,7 +737,9 @@ def fetch(
transformers1 = [
(
"imputer_str",
SimpleImputer(missing_values=None, strategy="most_frequent"),
# Use np.nan for missing_values to handle both None and np.nan
# In pandas 3.x, pd.NA becomes np.nan when converted to object dtype
SimpleImputer(missing_values=np.nan, strategy="most_frequent"),
categorical_cols,
),
("imputer_num", SimpleImputer(strategy="mean"), numeric_cols),
Expand Down Expand Up @@ -780,6 +790,14 @@ def fetch(
else:
col_names = [attr[0].lower() for attr in dataDictionary["attributes"]]
df_all = pd.DataFrame(dataDictionary["data"], columns=col_names)
# Convert string columns to object dtype for backward compatibility with pandas 2.x
# In pandas 3.x, string columns use StringDtype by default which causes issues with sklearn
for col in df_all.columns:
if (
hasattr(df_all[col].dtype, "name")
and df_all[col].dtype.name == "string"
):
df_all[col] = df_all[col].astype("object")
assert target_col in col_names, (target_col, col_names)
y = df_all[target_col]
# the type stubs for pandas are not currently complete enough to type this correctly
Expand Down
32 changes: 29 additions & 3 deletions lale/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,32 @@
torch_cat = None # type: ignore[assignment]
torch_from_numpy = None # type: ignore[assignment]


def safe_issubdtype(typ: Any, dtype_class: Any) -> bool:
    """
    Safely check if typ is a subdtype of dtype_class.

    Works like ``np.issubdtype`` but never raises for pandas extension
    dtypes (e.g., StringDtype in pandas 3.x) that NumPy cannot interpret.
    Extension dtypes that expose a NumPy equivalent through a
    ``numpy_dtype`` attribute (e.g., the nullable ``Int64``, ``Float64``
    and ``boolean`` dtypes) are checked against that equivalent, so a
    nullable numeric column is still recognized as numeric.

    Parameters
    ----------
    typ
        The dtype to check
    dtype_class
        The dtype class to check against (e.g., np.number, np.integer)

    Returns
    -------
    bool
        True if typ (or its NumPy equivalent) is a subtype of dtype_class,
        False otherwise, including when typ cannot be interpreted at all
    """
    try:
        return bool(np.issubdtype(typ, dtype_class))
    except (TypeError, AttributeError):
        # pandas extension dtypes raise TypeError in np.issubdtype;
        # fall through and look for a NumPy equivalent instead.
        pass

    numpy_equivalent = getattr(typ, "numpy_dtype", None)
    if numpy_equivalent is None:
        # No NumPy counterpart (e.g., string or categorical dtypes).
        return False
    try:
        return bool(np.issubdtype(numpy_equivalent, dtype_class))
    except (TypeError, AttributeError):
        # e.g., typ was a dtype *class*, whose numpy_dtype is a property
        # object rather than an actual dtype.
        return False


spark_loader = util.find_spec("pyspark")
spark_installed = spark_loader is not None
if spark_installed:
Expand Down Expand Up @@ -179,11 +205,11 @@ def subarray_to_json(indices: Tuple[int, ...]) -> Any:
if len(indices) == len(arr.shape):
if isinstance(arr[indices], (bool, int, float, str)):
return arr[indices]
elif np.issubdtype(arr.dtype, np.bool_):
elif safe_issubdtype(arr.dtype, np.bool_):
return bool(arr[indices])
elif np.issubdtype(arr.dtype, np.integer):
elif safe_issubdtype(arr.dtype, np.integer):
return int(arr[indices])
elif np.issubdtype(arr.dtype, np.number):
elif safe_issubdtype(arr.dtype, np.number):
return float(arr[indices])
elif arr.dtype.kind in ["U", "S", "O"]:
return str(arr[indices])
Expand Down
4 changes: 3 additions & 1 deletion lale/lib/aif360/orbis.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,9 @@ def _orbis_resample(X, y, diaeresis_y, osizes, nsizes, sampler_hparams):
**{h: v for h, v in sampler_hparams.items() if h not in ["replacement"]},
"sampling_strategy": over_sizes,
}
cats_mask = [not np.issubdtype(typ, np.number) for typ in Xyy.dtypes]
from lale.helpers import safe_issubdtype

cats_mask = [not safe_issubdtype(typ, np.number) for typ in Xyy.dtypes]
if all(cats_mask): # all nominal -> use SMOTEN
over_op = imblearn.over_sampling.SMOTEN(**over_hparams)
elif not any(cats_mask): # all continuous -> use vanilla SMOTE
Expand Down
4 changes: 3 additions & 1 deletion lale/lib/category_encoders/target_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,9 @@ def __init__(self, **hyperparams):
def fit(self, X, y):
if catenc_version is None:
raise ValueError("The package 'category_encoders' is not installed.")
if np.issubdtype(y.dtype, np.number):
from lale.helpers import safe_issubdtype

if safe_issubdtype(y.dtype, np.number):
numeric_y = y
else:
from sklearn.preprocessing import LabelEncoder
Expand Down
4 changes: 3 additions & 1 deletion lale/lib/imblearn/smotenc.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,10 @@ def __init__(self, operator=None, **hyperparams):
def fit(self, X, y=None):
if self.resampler is None:
if self._hyperparams["categorical_features"] is None:
from lale.helpers import safe_issubdtype

self._hyperparams["categorical_features"] = [
not np.issubdtype(typ, np.number) for typ in X.dtypes
not safe_issubdtype(typ, np.number) for typ in X.dtypes
]
self.resampler = imblearn.over_sampling.SMOTENC(**self._hyperparams)
return super().fit(X, y)
Expand Down
6 changes: 4 additions & 2 deletions lale/lib/rasl/ordinal_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,12 @@ def from_monoid(self, monoid: _OrdinalEncoderMonoid):
def _build_transformer(self):
assert self._monoid is not None

from lale.helpers import safe_issubdtype

def simplify_val(v):
if np.issubdtype(type(v), np.integer):
if safe_issubdtype(type(v), np.integer):
return int(v)
if np.issubdtype(type(v), np.floating):
if safe_issubdtype(type(v), np.floating):
return float(v)
return v

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
"jsonsubschema>=0.0.6",
"scikit-learn>=1.0.0,<1.8.0",
"scipy",
"pandas<3.0.0",
"pandas",
"packaging",
"decorator",
"typing-extensions",
Expand Down
63 changes: 44 additions & 19 deletions test/test_aif360.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,25 +23,7 @@
import numpy as np
import pandas as pd
import sklearn.model_selection

try:
import cvxpy # noqa because the import is only done as a check and flake fails

cvxpy_installed = True
except ImportError:
cvxpy_installed = False

try:
import numba # noqa because the import is only done as a check and flake fails

numba_installed = True
except ImportError:
numba_installed = False

try:
import tensorflow as tf
except ImportError:
tf = None
from packaging import version

import lale.helpers
import lale.lib.aif360
Expand Down Expand Up @@ -76,6 +58,31 @@
)


def _pandas_version_ge_3():
    """Check if the installed pandas belongs to the 3.x series or later.

    Only the major release component is compared.  Under PEP 440 ordering
    a pre-release or development build such as ``3.0.0rc1`` or
    ``3.0.0.dev0`` sorts *before* ``3.0``, so comparing full versions
    would report those builds as older than pandas 3.

    Returns
    -------
    bool
        True if the major version of ``pd.__version__`` is at least 3.
    """
    return version.parse(pd.__version__).major >= 3


try:
import cvxpy # noqa because the import is only done as a check and flake fails

cvxpy_installed = True
except ImportError:
cvxpy_installed = False

try:
import numba # noqa because the import is only done as a check and flake fails

numba_installed = True
except ImportError:
numba_installed = False

try:
import tensorflow as tf
except ImportError:
tf = None


class TestAIF360Datasets(unittest.TestCase):
downloaded_h181 = False
downloaded_h192 = False
Expand Down Expand Up @@ -291,6 +298,12 @@ def test_dataset_meps_panel19_fy2015_pd_cat(self):
)
self._attempt_dataset(X, y, fairness_info, 16578, 1825, {0, 1}, 0.496)

@unittest.skipIf(
_pandas_version_ge_3(),
"MEPS dataset preprocessing with aif360 is incompatible with pandas 3.x due to "
"aif360's internal use of StandardDataset which tries to assign float values to "
"StringDtype columns. This is a limitation in the aif360 library itself.",
)
def test_dataset_meps_panel19_fy2015_pd_num(self):
X, y, fairness_info = lale.lib.aif360.fetch_meps_panel19_fy2015_df(
preprocess=True
Expand All @@ -303,6 +316,12 @@ def test_dataset_meps_panel20_fy2015_pd_cat(self):
)
self._attempt_dataset(X, y, fairness_info, 18849, 1825, {0, 1}, 0.493)

@unittest.skipIf(
_pandas_version_ge_3(),
"MEPS dataset preprocessing with aif360 is incompatible with pandas 3.x due to "
"aif360's internal use of StandardDataset which tries to assign float values to "
"StringDtype columns. This is a limitation in the aif360 library itself.",
)
def test_dataset_meps_panel20_fy2015_pd_num(self):
X, y, fairness_info = lale.lib.aif360.fetch_meps_panel20_fy2015_df(
preprocess=True
Expand All @@ -315,6 +334,12 @@ def test_dataset_meps_panel21_fy2016_pd_cat(self):
)
self._attempt_dataset(X, y, fairness_info, 17052, 1936, {0, 1}, 0.462)

@unittest.skipIf(
_pandas_version_ge_3(),
"MEPS dataset preprocessing with aif360 is incompatible with pandas 3.x due to "
"aif360's internal use of StandardDataset which tries to assign float values to "
"StringDtype columns. This is a limitation in the aif360 library itself.",
)
def test_dataset_meps_panel21_fy2016_pd_num(self):
X, y, fairness_info = lale.lib.aif360.fetch_meps_panel21_fy2016_df(
preprocess=True
Expand Down
Loading