Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .DS_Store
Binary file not shown.
Binary file added .github/.DS_Store
Binary file not shown.
8 changes: 4 additions & 4 deletions .github/workflows/development_CI.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4

- name: Set up Python 3.8
uses: actions/setup-python@v2
- name: Set up Python 3.12
uses: actions/setup-python@v5
with:
python-version: 3.8
python-version: "3.12"

- name: Install dependencies
run: |
Expand Down
Empty file added README.md
Empty file.
Binary file added cobra/.DS_Store
Binary file not shown.
14 changes: 7 additions & 7 deletions cobra/evaluation/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)):

auc = float(self.scalar_metrics.loc["AUC"])

with plt.style.context("seaborn-whitegrid"):
with plt.style.context("seaborn-v0_8-whitegrid"):

fig, ax = plt.subplots(figsize=dim)

Expand Down Expand Up @@ -255,7 +255,7 @@ def plot_cumulative_response_curve(self, path: str=None, dim: tuple=(12, 8)):

lifts = np.array(lifts)*inc_rate*100

with plt.style.context("seaborn-ticks"):
with plt.style.context("seaborn-v0_8-ticks"):
fig, ax = plt.subplots(figsize=dim)

plt.bar(x_labels[::-1], lifts, align="center",
Expand Down Expand Up @@ -304,7 +304,7 @@ def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)):

x_labels, lifts, _ = self.lift_curve

with plt.style.context("seaborn-ticks"):
with plt.style.context("seaborn-v0_8-ticks"):
fig, ax = plt.subplots(figsize=dim)

plt.bar(x_labels[::-1], lifts, align="center",
Expand Down Expand Up @@ -345,7 +345,7 @@ def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)):
Tuple with width and length of the plot.
"""

with plt.style.context("seaborn-whitegrid"):
with plt.style.context("seaborn-v0_8-whitegrid"):
fig, ax = plt.subplots(figsize=dim)

ax.plot(self.cumulative_gains[0]*100, self.cumulative_gains[1]*100,
Expand Down Expand Up @@ -675,7 +675,7 @@ def plot_predictions(self, path: str=None, dim: tuple=(12, 8)):
y_true = self.y_true
y_pred = self.y_pred

with plt.style.context("seaborn-whitegrid"):
with plt.style.context("seaborn-v0_8-whitegrid"):

fig, ax = plt.subplots(figsize=dim)

Expand Down Expand Up @@ -711,7 +711,7 @@ def plot_qq(self, path: str=None, dim: tuple=(12, 8)):

raise NotFittedError(msg.format(self.__class__.__name__))

with plt.style.context("seaborn-whitegrid"):
with plt.style.context("seaborn-v0_8-whitegrid"):

fig, ax = plt.subplots(figsize=dim)

Expand All @@ -733,4 +733,4 @@ def plot_qq(self, path: str=None, dim: tuple=(12, 8)):
if path:
plt.savefig(path, format="png", dpi=300, bbox_inches="tight")

plt.show()
plt.show()
14 changes: 11 additions & 3 deletions cobra/evaluation/pigs_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,15 @@ def generate_pig_tables(basetable: pd.DataFrame,
for column_name in sorted(preprocessed_predictors)
if column_name not in no_predictor
]

if len(pigs) == 0:
raise ValueError(
"No preprocessed predictors were provided to generate_pig_tables. "
"Make sure you ran preprocessor.transform(...) successfully and "
"that preprocessed_predictors contains columns ending in '_bin' "
"or '_processed'."
)

output = pd.concat(pigs, ignore_index=True)
return output

Expand Down Expand Up @@ -145,16 +154,15 @@ def plot_incidence(pig_tables: pd.DataFrame,
'the same set of variables.')

df_plot['label'] = df_plot['label'].astype('category')
df_plot['label'].cat.reorder_categories(column_order,
inplace=True)
df_plot['label'] = df_plot['label'].cat.reorder_categories(column_order)

df_plot.sort_values(by=['label'], ascending=True, inplace=True)
df_plot.reset_index(inplace=True)
else:
df_plot.sort_values(by=['avg_target'], ascending=False, inplace=True)
df_plot.reset_index(inplace=True)

with plt.style.context("seaborn-ticks"):
with plt.style.context("seaborn-v0_8-ticks"):
fig, ax = plt.subplots(figsize=dim)

# --------------------------
Expand Down
6 changes: 3 additions & 3 deletions cobra/evaluation/plotting_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def plot_univariate_predictor_quality(df_metric: pd.DataFrame,
value_name=metric)

# plot data
with plt.style.context("seaborn-ticks"):
with plt.style.context("seaborn-v0_8-ticks"):
fig, ax = plt.subplots(figsize=dim)

ax = sns.barplot(x=metric, y="predictor", hue="split", data=df)
Expand Down Expand Up @@ -122,7 +122,7 @@ def plot_performance_curves(model_performance: pd.DataFrame,
max(model_performance['selection_performance']),
max(model_performance['validation_performance'])), 1)

with plt.style.context("seaborn-whitegrid"):
with plt.style.context("seaborn-v0_8-whitegrid"):

fig, ax = plt.subplots(figsize=dim)

Expand Down Expand Up @@ -178,7 +178,7 @@ def plot_variable_importance(df_variable_importance: pd.DataFrame,
path : str, optional
Path to store the figure.
"""
with plt.style.context("seaborn-ticks"):
with plt.style.context("seaborn-v0_8-ticks"):
fig, ax = plt.subplots(figsize=dim)
ax = sns.barplot(x="importance", y="predictor",
data=df_variable_importance,
Expand Down
9 changes: 7 additions & 2 deletions cobra/preprocessing/categorical_data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,10 @@ def _replace_missings(data: pd.DataFrame,
temp = data[column_names]
else:
temp = data.copy()

# Cast to object first so mixed/string replacements remain valid
# for numeric and boolean categorical columns under newer pandas.
temp = temp.astype(object)
temp = temp.fillna("Missing")
temp = temp.replace(regex, "")
temp = temp.replace("", "Missing")
Expand Down Expand Up @@ -462,7 +466,7 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str,

if model_type == "classification":
contingency_table = pd.crosstab(index=df["other_categories"], columns=df["y"],
margins=False)
margins=False).astype(np.float64)

# if true, we scale the "other" categories
if scale_contingency_table:
Expand All @@ -471,7 +475,8 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str,

contingency_table.iloc[1, 0] = (1-incidence_mean) * size_other_cats
contingency_table.iloc[1, 1] = incidence_mean * size_other_cats
contingency_table = contingency_table.values.astype(np.int64)

contingency_table = contingency_table.to_numpy(dtype=np.float64)

pval = stats.chi2_contingency(contingency_table, correction=False)[1]

Expand Down
19 changes: 10 additions & 9 deletions cobra/preprocessing/kbins_discretizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,24 +314,25 @@ def _transform_column(self, data: pd.DataFrame,

column_name_bin = column_name + "_bin"

# use pd.cut to compute bins
data[column_name_bin] = pd.cut(x=data[column_name],
bins=interval_idx)
# Build the categorical Series fully first, then assign it once.
# Newer pandas is stricter about overwriting an existing categorical
# column with a different set of categories.
binned = pd.cut(x=data[column_name], bins=interval_idx)

# Rename bins so that the output has a proper format
bin_labels = self._create_bin_labels(bins)
binned = binned.cat.rename_categories(bin_labels)

data[column_name_bin] = (data[column_name_bin]
.cat.rename_categories(bin_labels))

if data[column_name_bin].isnull().sum() > 0:
if binned.isnull().sum() > 0:

# Add an additional bin for missing values
data[column_name_bin]=data[column_name_bin].cat.add_categories(["Missing"])
binned = binned.cat.add_categories(["Missing"])

# Replace NULL with "Missing"
# Otherwise these will be ignored in groupby
data[column_name_bin].fillna("Missing", inplace=True)
binned = binned.fillna("Missing")

data[column_name_bin] = binned

return data

Expand Down
50 changes: 31 additions & 19 deletions cobra/preprocessing/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,30 +249,42 @@ def get_continuous_and_discrete_columns(
"id_col_name is equal to None. If there is no id column ignore this warning"
)

# find continuous_vars and discrete_vars in the dataframe
col_dtypes = df.dtypes
excluded_columns = {id_col_name, target_column_name}

discrete_vars = [
col
for col in col_dtypes[col_dtypes == object].index.tolist()
if col not in [id_col_name, target_column_name]
for col in df.columns
if col not in excluded_columns
and (
pd.api.types.is_object_dtype(df[col])
or pd.api.types.is_string_dtype(df[col])
or isinstance(df[col].dtype, pd.CategoricalDtype)
or pd.api.types.is_bool_dtype(df[col])
)
]

for col in df.columns:
if col not in discrete_vars and col not in [
id_col_name,
target_column_name,
]: # omit discrete because a string, and target
val_counts = df[col].nunique()
if (
val_counts > 1 and val_counts <= 10
): # the column contains less than 10 different values
discrete_vars.append(col)

continuous_vars = list(
set(df.columns)
- set(discrete_vars)
- set([id_col_name, target_column_name])
)
if col in discrete_vars or col in excluded_columns:
continue
if not pd.api.types.is_numeric_dtype(df[col]):
continue
if pd.api.types.is_bool_dtype(df[col]):
continue

val_counts = df[col].nunique()
if (
val_counts > 1 and val_counts <= 10
): # the column contains less than 10 different values
discrete_vars.append(col)

continuous_vars = [
col
for col in df.columns
if col not in excluded_columns
and col not in discrete_vars
and pd.api.types.is_numeric_dtype(df[col])
and not pd.api.types.is_bool_dtype(df[col])
]
log.warning(
f"""Cobra automaticaly assumes that following variables are
discrete: {discrete_vars}
Expand Down
24 changes: 10 additions & 14 deletions cobra/preprocessing/target_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,29 +270,25 @@ def _transform_column(self, data: pd.DataFrame,
_data = data.copy()
new_column = TargetEncoder._clean_column_name(column_name)

# Convert dtype to float, because when the original dtype
# is of type "category", the resulting dtype would otherwise also be of
# type "category":
_data[new_column] = (_data[column_name].map(self._mapping[column_name])
.astype("float"))
# Convert dtype to float up front so encoded values are written into
# a fresh float Series, which avoids dtype collisions on newer pandas.
encoded = _data[column_name].map(self._mapping[column_name]).astype("float")

# In case of categorical data, it could be that new categories will
# emerge which were not present in the train set, so this will result
# in missing values, which should be replaced according to the
# configured imputation strategy:
if _data[new_column].isnull().sum() > 0:
if encoded.isnull().sum() > 0:
if self.imputation_strategy == "mean":
_data[new_column].fillna(self._global_mean,
inplace=True)
encoded = encoded.fillna(self._global_mean)
elif self.imputation_strategy == "min":
_data[new_column].fillna(_data[new_column].min(),
inplace=True)
encoded = encoded.fillna(encoded.min())
elif self.imputation_strategy == "max":
_data[new_column].fillna(_data[new_column].max(),
inplace=True)
encoded = encoded.fillna(encoded.max())
elif self.imputation_strategy == "median":
_data[new_column].fillna(_data[new_column].median(),
inplace=True)
encoded = encoded.fillna(encoded.median())

_data[new_column] = encoded

return _data

Expand Down
2 changes: 1 addition & 1 deletion cobra/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.1.1"
__version__ = "1.1.2"
6 changes: 6 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
def main():
    """Write the cobra greeting to standard output."""
    greeting = "Hello from cobra!"
    print(greeting)


# Run the greeting only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
18 changes: 18 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[project]
name = "cobra"
version = "1.1.2"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"ipykernel>=7.2.0",
"jupyter>=1.1.1",
"matplotlib>=3.8.0",
"numpy>=1.26.0",
"pandas>=2.1.0",
"pythonpredictions-cobra>=1.1.0",
"scikit-learn>=1.2.0",
"scipy>=1.11.2",
"seaborn>=0.13.2",
"tqdm>=4.62.2",
]
7 changes: 7 additions & 0 deletions requirements copy.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
numpy>=1.19.4
pandas>=1.1.5,<2.0.0
scipy>=1.5.4
scikit-learn>=1.2.0
matplotlib>=3.4.3
seaborn>=0.11.0
tqdm>=4.62.2
12 changes: 6 additions & 6 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
numpy>=1.19.4
pandas>=1.1.5,<2.0.0
scipy>=1.5.4
numpy>=1.26.0
pandas>=2.1.0
scipy>=1.11.2
scikit-learn>=1.2.0
matplotlib>=3.4.3
seaborn>=0.11.0
tqdm>=4.62.2
matplotlib>=3.8.0
seaborn>=0.13.2
tqdm>=4.62.2
13 changes: 7 additions & 6 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,13 @@
license="MIT",
author="Python Predictions",
author_email="cobra@pythonpredictions.com",
python_requires=">=3.10",
install_requires=[
"numpy>=1.19.4",
"pandas>=1.1.5,<2.0.0",
"scipy>=1.5.4",
"scikit-learn>=0.24.1",
"matplotlib>=3.4.3",
"seaborn>=0.11.0",
"numpy>=1.26.0",
"pandas>=2.1.0",
"scipy>=1.11.2",
"scikit-learn>=1.2.0",
"matplotlib>=3.8.0",
"seaborn>=0.13.2",
"tqdm>=4.62.2"]
)
Binary file added tests/.DS_Store
Binary file not shown.
Loading
Loading