diff --git a/src/evidently/core/metric_types.py b/src/evidently/core/metric_types.py index 62b2b3a90e..531bf89357 100644 --- a/src/evidently/core/metric_types.py +++ b/src/evidently/core/metric_types.py @@ -486,6 +486,12 @@ class Config: count_display_name_template: str = "Missing label {label} count" share_display_name_template: str = "Missing label {label} share" + @validator("counts", "shares", pre=True) + def _convert_label_keys(cls, value): + if not isinstance(value, dict): + return value + return {convert_types(k): v for k, v in value.items()} + def labels(self) -> List[Label]: return list(self.counts.keys()) @@ -551,6 +557,8 @@ def convert_labels(cls, value): def convert_types(val): + if val is pd.NA: + return None if isinstance( val, ( diff --git a/src/evidently/metrics/column_statistics.py b/src/evidently/metrics/column_statistics.py index ddde369af1..6ae47f054b 100644 --- a/src/evidently/metrics/column_statistics.py +++ b/src/evidently/metrics/column_statistics.py @@ -869,9 +869,9 @@ def share_label_display_name(self, label: Label) -> str: return f"Unique Value Share: {self.metric.column} for label {label}" def _all_unique_values(self, current: Dataset, reference: Optional[Dataset]) -> set: - values = set(current.as_dataframe()[self.metric.column].unique()) + values = set(current.as_dataframe()[self.metric.column].dropna().unique()) if reference is not None: - values.update(reference.as_dataframe()[self.metric.column].unique()) + values.update(reference.as_dataframe()[self.metric.column].dropna().unique()) return values def _calculate_value(self, dataset: Dataset, values: set): diff --git a/tests/future/metrics/test_unique_value_count.py b/tests/future/metrics/test_unique_value_count.py index 35b71d5fa2..99c2b8d1d3 100644 --- a/tests/future/metrics/test_unique_value_count.py +++ b/tests/future/metrics/test_unique_value_count.py @@ -1,5 +1,7 @@ +import numpy as np import pandas as pd +from evidently import DataDefinition from evidently import Dataset from evidently import Report from evidently.core.metric_types import ByLabelCountValue @@ -40,3 +42,37 @@ def test_unique_value_count_metric(): assert label_count.display_name == result[label]["count_display_name"] assert label_share.value == result[label]["share"] assert label_share.display_name == result[label]["share_display_name"] + + +def test_unique_value_count_with_pd_na_in_string_dtype(): + """Issue #1616: pd.NA in a nullable-string column must not become a label key.""" + metric = UniqueValueCount(column="col1") + data = pd.DataFrame({"col1": pd.Series(["a", "a", "b", pd.NA], dtype="string")}) + dataset = Dataset.from_pandas(data, DataDefinition(categorical_columns=["col1"])) + res = Report([metric]).run(dataset, None)._context.get_metric_result(metric) + assert isinstance(res, ByLabelCountValue) + assert set(res.labels()) == {"a", "b"} + a_count, _ = res.get_label_result("a") + b_count, _ = res.get_label_result("b") + assert a_count.value == 2 + assert b_count.value == 1 + + +def test_unique_value_count_with_nan_in_object_dtype(): + """np.nan / None in an object column also must not surface as a label.""" + metric = UniqueValueCount(column="col1") + data = pd.DataFrame({"col1": pd.Series(["a", "b", None, np.nan], dtype="object")}) + dataset = Dataset.from_pandas(data, DataDefinition(categorical_columns=["col1"])) + res = Report([metric]).run(dataset, None)._context.get_metric_result(metric) + assert isinstance(res, ByLabelCountValue) + assert set(res.labels()) == {"a", "b"} + + +def test_unique_value_count_with_pd_na_in_int64_dtype(): + """pd.NA in a nullable Int64 column.""" + metric = UniqueValueCount(column="col1") + data = pd.DataFrame({"col1": pd.array([1, 2, 2, pd.NA], dtype="Int64")}) + dataset = Dataset.from_pandas(data, DataDefinition(categorical_columns=["col1"])) + res = Report([metric]).run(dataset, None)._context.get_metric_result(metric) + assert isinstance(res, ByLabelCountValue) + assert set(res.labels()) == {1, 2} diff --git a/tests/future/presets/test_dataset_stats_na.py b/tests/future/presets/test_dataset_stats_na.py new file mode 100644 index 0000000000..2020ca0cc3 --- /dev/null +++ b/tests/future/presets/test_dataset_stats_na.py @@ -0,0 +1,22 @@ +import pandas as pd + +from evidently import DataDefinition +from evidently import Dataset +from evidently import Report +from evidently.presets import DataDriftPreset +from evidently.presets import DataSummaryPreset + + +def _make_dataset() -> Dataset: + df = pd.DataFrame({"a": pd.Series(["x", "y", "z", pd.NA], dtype="string")}) + return Dataset.from_pandas(df, DataDefinition(categorical_columns=["a"])) + + +def test_data_summary_preset_handles_pd_na(): + """Issue #1616: DataSummaryPreset must not fail on pd.NA in a categorical column.""" + Report([DataSummaryPreset()]).run(current_data=_make_dataset()) + + +def test_data_drift_preset_handles_pd_na(): + """Issue #1616: DataDriftPreset must not fail on pd.NA in a categorical column.""" + Report([DataDriftPreset()]).run(current_data=_make_dataset(), reference_data=_make_dataset()) diff --git a/tests/future/test_metric_types.py b/tests/future/test_metric_types.py index d6af0c6945..732870eadd 100644 --- a/tests/future/test_metric_types.py +++ b/tests/future/test_metric_types.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd import pytest from evidently.core.metric_types import ByLabelCountValue @@ -9,6 +10,8 @@ "input,output", [ ({np.nan: (1.0, 1.0)}, ({"nan": 1.0}, {"nan": 1.0})), + ({pd.NA: (1.0, 1.0)}, ({None: 1.0}, {None: 1.0})), + ({np.int64(7): (3.0, 0.5)}, ({7: 3.0}, {7: 0.5})), ], ) def test_by_label_count_value(input: dict, output: tuple):